From fe0e66564ecab9627ba9313ab7c116b586b7cf19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:39:20 +0200 Subject: [PATCH 001/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/cchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index de4aed696..ab54078a3 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From 2b9443b7e78aa4b5f77e5d4d4cb03205bcdd52fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:40:29 +0200 Subject: [PATCH 002/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/dchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 00e8eb57f..6399fecef 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
From f4d4abd423ecf998faa70e09847fd99cdac8888a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:41:45 +0200 Subject: [PATCH 003/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/schkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index c3f9ca162..5484a7c26 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From f176ff90af6b1d16f940575ea2f03edc13e5f444 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:42:43 +0200 Subject: [PATCH 004/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/zchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 908b7d651..7e9144d15 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
From 6e3fbe8ac5a405149ebd6acaad6a4c88d3e07215 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:59:15 +0200 Subject: [PATCH 005/143] Update version to 0.3.17.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37191a42b..0330b2ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 17) +set(OpenBLAS_PATCH_VERSION 17.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 38d5b4b1241f60ab533f136b2d8e61eef1f5062e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 15:00:01 +0200 Subject: [PATCH 006/143] Update version to 0.3.17.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 2e0980fa9..7c04a3101 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.17 +VERSION = 0.3.17.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 49bbf330ca592f439a07f24f137e61af1cc9c616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jul 2021 22:19:19 +0200 Subject: [PATCH 007/143] Empirical workaround for numpy SVD NaN problem from issue 3318 --- kernel/Makefile.L2 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 888a9b959..ac53c29c3 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,10 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + ### GEMV ### ifndef SGEMVNKERNEL @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) - $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) From 30f23be0f94c7041b7e3bb53a4a0236355cdabad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 12:00:03 +0200 Subject: [PATCH 008/143] Rework setting of -mfma to only apply it where necessary --- cmake/cc.cmake | 6 +++--- cmake/system.cmake | 10 +++++----- cmake/utils.cmake | 10 +++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 76952152b..ac5e455d5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () - if (HAVE_FMA3) - set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") - endif () + # if (HAVE_FMA3) + #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + #endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index 
34874827c..f8bd6678e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -186,11 +186,11 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_FMA3) - if (NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") - endif() - endif() + # if (DEFINED HAVE_FMA3) + # if (NOT NO_AVX2) + # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + # endif() + # endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 794d73d06..2c1a1c763 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in) configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) - + message (STATUS ${new_source_file}) + if (DEFINED HAVE_FMA3) + if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + if ( ${new_source_file} MATCHES "dgemv_t_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + endif () endforeach () endforeach () From 47ba85f314808476c8254779389607f9af60231f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 17:24:15 +0200 Subject: [PATCH 009/143] Fix regex to match kernels suffixed with cpuname too --- cmake/utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 2c1a1c763..6b54092ea 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -313,10 +313,10 @@ function(GenerateNamedObjects sources_in) list(APPEND SRC_LIST_OUT ${new_source_file}) message (STATUS ${new_source_file}) if (DEFINED HAVE_FMA3) - if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") 
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () - if ( ${new_source_file} MATCHES "dgemv_t_k.c") + if ( ${new_source_file} MATCHES "dgemv_t_k.*c") set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () endif () From efbd7c7840f01f6479fb0224ff473c3166eee669 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 Jul 2021 13:42:52 +0200 Subject: [PATCH 010/143] GCC did not support -mtune for ARM64 before 5.1 --- Makefile.arm64 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index c23a0876e..2656a17f9 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,4 +1,15 @@ ifneq ($(C_COMPILER), PGI) + +ifneq ($(GCCVERSIONGT4), 1) +CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a +endif + + +else + + ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a ifneq ($(F_COMPILER), NAG) @@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag endif endif endif + endif + +endif \ No newline at end of file From af0a69f355a086d70cc08ccda8bde7a48b3133c4 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Jul 2021 15:44:54 +0800 Subject: [PATCH 011/143] Add support for LOONGARCH64 --- Makefile.loongarch64 | 3 + Makefile.system | 12 + TargetList.txt | 2 + c_check | 53 +- common.h | 6 +- common_loongarch64.h | 199 ++ common_macro.h | 3 +- cpuid_loongarch64.c | 110 + ctest.c | 4 + getarch.c | 24 +- kernel/loongarch64/KERNEL | 236 ++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/KERNEL.generic | 167 ++ kernel/loongarch64/Makefile | 1 + kernel/loongarch64/amax.S | 230 ++ kernel/loongarch64/amin.S | 186 ++ kernel/loongarch64/asum.S | 232 ++ kernel/loongarch64/cnrm2.S | 159 ++ kernel/loongarch64/copy.S | 225 ++ kernel/loongarch64/dnrm2.S | 314 +++ kernel/loongarch64/dot.S | 391 ++++ kernel/loongarch64/gemm_kernel.S | 1859 ++++++++++++++++ kernel/loongarch64/gemv_n.S | 531 +++++ kernel/loongarch64/gemv_t.S 
| 436 ++++ kernel/loongarch64/iamax.S | 233 ++ kernel/loongarch64/iamin.S | 233 ++ kernel/loongarch64/izamax.S | 217 ++ kernel/loongarch64/izamin.S | 217 ++ kernel/loongarch64/max.S | 174 ++ kernel/loongarch64/min.S | 174 ++ kernel/loongarch64/scal.S | 330 +++ kernel/loongarch64/snrm2.S | 249 +++ kernel/loongarch64/swap.S | 330 +++ kernel/loongarch64/trsm_kernel_LN.S | 2863 +++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_LT.S | 2854 ++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_RT.S | 2850 ++++++++++++++++++++++++ kernel/loongarch64/zamax.S | 190 ++ kernel/loongarch64/zamin.S | 198 ++ kernel/loongarch64/zasum.S | 158 ++ kernel/loongarch64/zcopy.S | 217 ++ kernel/loongarch64/zdot.S | 330 +++ kernel/loongarch64/zgemm3m_kernel.S | 1359 ++++++++++++ kernel/loongarch64/zgemm_kernel.S | 1047 +++++++++ kernel/loongarch64/zgemv_n.S | 648 ++++++ kernel/loongarch64/zgemv_t.S | 556 +++++ kernel/loongarch64/znrm2.S | 304 +++ kernel/loongarch64/zscal.S | 356 +++ kernel/loongarch64/ztrsm_kernel_LT.S | 1344 ++++++++++++ kernel/loongarch64/ztrsm_kernel_RT.S | 1343 ++++++++++++ lapack/laswp/loongarch64/Makefile | 12 + param.h | 46 + 51 files changed, 24189 insertions(+), 27 deletions(-) create mode 100644 Makefile.loongarch64 create mode 100644 common_loongarch64.h create mode 100644 cpuid_loongarch64.c create mode 100644 kernel/loongarch64/KERNEL create mode 100644 kernel/loongarch64/KERNEL.LOONGSON3R5 create mode 100644 kernel/loongarch64/KERNEL.generic create mode 100644 kernel/loongarch64/Makefile create mode 100644 kernel/loongarch64/amax.S create mode 100644 kernel/loongarch64/amin.S create mode 100644 kernel/loongarch64/asum.S create mode 100644 kernel/loongarch64/cnrm2.S create mode 100644 kernel/loongarch64/copy.S create mode 100644 kernel/loongarch64/dnrm2.S create mode 100644 kernel/loongarch64/dot.S create mode 100644 kernel/loongarch64/gemm_kernel.S create mode 100644 kernel/loongarch64/gemv_n.S create mode 100644 kernel/loongarch64/gemv_t.S create 
mode 100644 kernel/loongarch64/iamax.S create mode 100644 kernel/loongarch64/iamin.S create mode 100644 kernel/loongarch64/izamax.S create mode 100644 kernel/loongarch64/izamin.S create mode 100644 kernel/loongarch64/max.S create mode 100644 kernel/loongarch64/min.S create mode 100644 kernel/loongarch64/scal.S create mode 100644 kernel/loongarch64/snrm2.S create mode 100644 kernel/loongarch64/swap.S create mode 100644 kernel/loongarch64/trsm_kernel_LN.S create mode 100644 kernel/loongarch64/trsm_kernel_LT.S create mode 100644 kernel/loongarch64/trsm_kernel_RT.S create mode 100644 kernel/loongarch64/zamax.S create mode 100644 kernel/loongarch64/zamin.S create mode 100644 kernel/loongarch64/zasum.S create mode 100644 kernel/loongarch64/zcopy.S create mode 100644 kernel/loongarch64/zdot.S create mode 100644 kernel/loongarch64/zgemm3m_kernel.S create mode 100644 kernel/loongarch64/zgemm_kernel.S create mode 100644 kernel/loongarch64/zgemv_n.S create mode 100644 kernel/loongarch64/zgemv_t.S create mode 100644 kernel/loongarch64/znrm2.S create mode 100644 kernel/loongarch64/zscal.S create mode 100644 kernel/loongarch64/ztrsm_kernel_LT.S create mode 100644 kernel/loongarch64/ztrsm_kernel_RT.S create mode 100644 lapack/laswp/loongarch64/Makefile diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.system b/Makefile.system index bb8c60e91..4084390db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -780,6 +780,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSONG3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef 
BINARY_DEFINED diff --git a/TargetList.txt b/TargetList.txt index f93a629d8..963545cdd 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -110,3 +110,5 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 diff --git a/c_check b/c_check index e24943a29..030f5e632 100644 --- a/c_check +++ b/c_check @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if 
($binary eq "64"); @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index ac795937c..ff5254a5c 100644 --- a/common.h +++ b/common.h @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..959e7e58a --- /dev/null +++ 
b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if 
defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 1) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..0136f18ab 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2490,7 +2490,8 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} diff --git a/ctest.c b/ctest.c index d674a8cbd..4f18918f5 100644 --- a/ctest.c +++ b/ctest.c @@ -157,6 +157,10 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif diff --git a/getarch.c b/getarch.c index 
3bc8a0c3d..6e43616f7 100644 --- a/getarch.c +++ b/getarch.c @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" +#else +#endif + #ifdef FORCE_I6400 #define FORCE #define ARCHITECTURE "MIPS" @@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" #define OPENBLAS_SUPPORTED @@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL new file mode 100644 index 000000000..e96a90e72 --- /dev/null +++ b/kernel/loongarch64/KERNEL @@ -0,0 +1,236 @@ +ifndef SAXPYKERNEL +SAXPYKERNEL = ../arm/axpy.c +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = ../arm/axpy.c +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef SROTKERNEL +SROTKERNEL = ../arm/rot.c +endif + +ifndef DROTKERNEL +DROTKERNEL = ../arm/rot.c +endif + +ifndef CROTKERNEL +CROTKERNEL = ../arm/zrot.c +endif + +ifndef ZROTKERNEL +ZROTKERNEL = ../arm/zrot.c +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = ../arm/zswap.c +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = ../arm/zswap.c +endif + +ifndef SSUMKERNEL +SSUMKERNEL = ../arm/sum.c +endif + +ifndef DSUMKERNEL 
+DSUMKERNEL = ../arm/sum.c +endif + +ifndef CSUMKERNEL +CSUMKERNEL = ../arm/zsum.c +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = ../arm/zsum.c +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = ../arm/imax.c +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = ../arm/imax.c +endif + +ifndef ISMINKERNEL +ISMINKERNEL = ../arm/imin.c +endif + +ifndef IDMINKERNEL +IDMINKERNEL = ../arm/imin.c +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif + +ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif + +ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif + +ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = 
../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef CGEMM3MKERNEL +CGEMM3MKERNEL = zgemm3m_kernel.S +endif + +ifndef ZGEMM3MKERNEL +ZGEMM3MKERNEL = zgemm3m_kernel.S +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 new file mode 100644 index 000000000..cce4093e3 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -0,0 +1 @@ +#TODO: Add loongarch64 SIMD 
optimizations diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic new file mode 100644 index 000000000..105b2f6fd --- /dev/null +++ b/kernel/loongarch64/KERNEL.generic @@ -0,0 +1,167 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c 
+ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/loongarch64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S new file mode 100644 index 000000000..4b135c522 --- /dev/null +++ b/kernel/loongarch64/amax.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r17 +#define TEMP $r18 + +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + + LD a1, X, 0 * SIZE + addi.d N, N, -1 + + add.d X, X, INCX + FABS s1, a1 + + FABS s2, a1 + bge $r0, N, .L999 + + FABS s3, a1 + srai.d I, N, 3 + + FABS s4, a1 + bge $r0, I, .L15 + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD 
a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, s1, t1, $fcc0 + + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move 
$r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S new file mode 100644 index 000000000..ff9978f26 --- /dev/null +++ b/kernel/loongarch64/amin.S @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT 
s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 +.L15: + andi I, N, 7 +NOP + bge $r0, I, .L998 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S new file mode 100644 index 000000000..e4c717085 --- /dev/null +++ b/kernel/loongarch64/asum.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + FABS t1, a1 + LD a6, X, 5 * SIZE + FABS t2, a2 + LD a7, X, 6 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + ADD s1, s1, t1 + LD a1, X, 8 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD 
a2, X, 9 * SIZE + FABS t2, a6 + NOP + ADD s1, s1, t3 + LD a3, X, 10 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 11 * SIZE + FABS t4, a8 + addi.d X, X, 8 * SIZE + ADD s1, s1, t1 + LD a5, X, 4 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 5 * SIZE + FABS t2, a2 + NOP + ADD s1, s1, t3 + LD a7, X, 6 * SIZE + FABS t3, a3 + NOP + ADD s2, s2, t4 + LD a8, X, 7 * SIZE + FABS t4, a4 + blt $r0, I, .L12 + .align 3 +.L13: + ADD s1, s1, t1 + addi.d X, X, 8 * SIZE + FABS t1, a5 + NOP + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + ADD s1, s1, t1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + LD a7, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + add.d X, X, INCX + ADD s2, s2, t2 + LD a2, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + add.d X, X, INCX + ADD s2, s2, t4 + LD a4, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + add.d X, X, INCX + ADD s2, s2, t2 + LD a6, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + add.d X, X, INCX + ADD s2, s2, t4 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 
+ ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + add.d X, X, INCX + ADD s1, s1, t1 + blt $r0, I, .L26 + .align 3 +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S new file mode 100644 index 000000000..c4b2555d3 --- /dev/null +++ b/kernel/loongarch64/cnrm2.S @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, 2 * SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 2 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + fcvt.d.s t1, a1 + LD a7, X, 0 * SIZE + fcvt.d.s t2, a2 + LD a8, X, 1 * SIZE + fcvt.d.s t3, a3 + addi.d I, I, -1 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + LD a2, X, 1 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + LD a4, X, 1 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + addi.d I, I, -1 + fmadd.d s2, t2, t2, s2 + LD a6, 
X, 1 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 1 * SIZE + fmadd.d s2, t4, t4, s2 + add.d X, X, INCX + fcvt.d.s t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fmadd.d s1, t1, t1, s1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S new file mode 100644 index 000000000..28b7bce4c --- /dev/null +++ b/kernel/loongarch64/copy.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE 
+ addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + 
andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, 
X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, 
XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..4fcd569c8 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 +1,391 @@ +/*************************************************************************** +Copyright 
(c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 
* SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + 
.align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, 
$r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV 
c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + 
MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 
+ LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD 
c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, 
I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, 
c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, 
a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * 
SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f10, CO5, 0 * SIZE + MADD c21, c21, ALPHA, $f8 + LD $f11, CO6, 0 * SIZE + MADD c31, c31, ALPHA, $f23 + LD $f12, CO7, 0 * SIZE + MADD c41, c41, ALPHA, $f9 + LD $f13, CO8, 0 * SIZE + MADD c51, c51, ALPHA, $f10 + ST c11, CO1, 0 * SIZE + MADD c61, c61, ALPHA, $f11 + ST c21, CO2, 0 * SIZE + MADD c71, c71, ALPHA, $f12 + ST c31, CO3, 0 * SIZE + MADD c81, c81, ALPHA, $f13 + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + MUL c51, ALPHA, c51 + ST c21, CO2, 0 * SIZE + MUL c61, ALPHA, c61 + ST c31, CO3, 0 * SIZE + MUL c71, ALPHA, c71 + ST c41, CO4, 0 * SIZE + MUL c81, ALPHA, c81 + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 8 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + 
srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f11, CO3, -1 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f12, CO4, -2 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f13, CO4, -1 * SIZE + MADD c22, c22, ALPHA, $f9 + MADD c31, c31, ALPHA, $f10 + ST c11, CO1, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + ST c12, CO1, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + ST c21, CO2, -2 * SIZE + MADD c42, c42, ALPHA, $f13 + ST c22, CO2, -1 * SIZE + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + addi.d CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + addi.d CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + addi.d 
CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO2,CO2, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +#endif +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, 
b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + MADD c31, c31, ALPHA, $f23 + MADD c41, c41, ALPHA, $f9 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L 
+ add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L55 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 
12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + MADD c21, c21, ALPHA, $f23 + MADD c22, c22, ALPHA, $f9 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE + blt $r0, I, .L51 +#else + addi.d I, I, -1 + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L51 +#endif + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, 
AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L65 +#else + srai.d L, K, 2 + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + ST c11, CO1, 0 * 
SIZE + ST c21, CO2, 0 * SIZE +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L75 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 
2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + ADD c11, c11, c21 + ADD c12, c12, c22 + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + blt $r0, I, .L71 +#else + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L71 +#endif + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, 
BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + ADD c11, c11, c21 + MADD c11, c11, ALPHA, $f22 + ST c11, CO1, 0 * SIZE +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + ST c11, CO1, 0 * SIZE +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 96 + fld.d $f24, $sp, 56 + fld.d $f25, $sp, 64 + fld.d $f26, $sp, 72 + fld.d $f27, $sp, 80 + fld.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + LDARG 
$r20, $sp, 104 + LDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 120 + fld.d $f19, $sp, 128 + fld.d $f20, $sp, 136 + fld.d $f21, $sp, 144 +#endif + addi.d $sp, $sp, 160 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S new file mode 100644 index 000000000..334a2991f --- /dev/null +++ b/kernel/loongarch64/gemv_n.S @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define y1 $f16 +#define y2 $f17 +#define y3 $f3 +#define y4 $f1 +#define y5 $f2 +#define y6 $f4 +#define y7 $f5 +#define y8 $f6 +#define t1 $f7 +#define t2 $f18 +#define t3 $f19 +#define t4 $f20 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -48 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 + fst.d $f20, $sp, 32 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + LD a2, XX, 0 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * 
SIZE + add.d XX, XX, INCY + LD a4, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + add.d X, X, INCX + LD x2, X, 0 * SIZE + add.d X, X, INCX + move AO1, A + add.d AO2, A, LDA + add.d A, AO2, LDA + move YY, YORIG + MUL x1, ALPHA, x1 + srai.d I, M, 3 + MUL x2, ALPHA, x2 + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD y5, YY, 4 * SIZE + LD a6, AO2, 1 * SIZE + LD y6, YY, 5 * SIZE + LD a7, AO2, 2 * SIZE + LD y7, YY, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 8 * SIZE + 
MADD t2, a6, x2, t2 + LD a6, AO2, 9 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 10 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 11 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + MADD t1, a5, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD t2, a6, x2, t2 + addi.d AO2, AO2, 8 * SIZE + MADD t3, a7, x2, t3 + addi.d YY, YY, 8 * SIZE + MADD t4, a8, x2, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 1 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO2, 2 * SIZE + MADD y3, a3, x1, y3 + LD a8, AO2, 3 * SIZE + MADD y4, a4, x1, y4 + MADD y1, a5, x2, y1 + addi.d YY, YY, 4 * SIZE + MADD y2, a6, x2, y2 + addi.d AO1, AO1, 4 * SIZE + MADD y3, a7, x2, y3 + addi.d AO2, AO2, 4 * SIZE + MADD y4, a8, x2, y4 + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 2 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a5, AO2, 0 * 
SIZE + LD a6, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + addi.d YY, YY, 2 * SIZE + MADD y1, a5, x2, y1 + addi.d AO1, AO1, 2 * SIZE + MADD y2, a6, x2, y2 + addi.d AO2, AO2, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L17: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + MADD y1, a5, x2, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + .align 3 + +.L21: + LD x1, X, 0 * SIZE + add.d X, X, INCX + move YY, YORIG + move AO1, A + srai.d I, M, 3 + MUL x1, ALPHA, x1 + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD y5, YY, 4 * SIZE + LD y6, YY, 5 * SIZE + LD y7, YY, 6 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, 
y4 + LD a4, AO1, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + MADD y3, a3, x1, y3 + addi.d YY, YY, 4 * SIZE + MADD y4, a4, x1, y4 + addi.d AO1, AO1, 4 * SIZE + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + MADD y1, a1, x1, y1 + addi.d YY, YY, 2 * SIZE + MADD y2, a2, x1, y2 + addi.d AO1, AO1, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L27: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + MADD y1, a1, x1, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L900: + li YORIG, SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + addi.d XX, XX, 4 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + addi.d XX, XX, 1 * SIZE + ST a1, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 
16 + fld.d $f19, $sp, 24 + fld.d $f20, $sp, 32 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 48 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S new file mode 100644 index 000000000..19333ed4a --- /dev/null +++ b/kernel/loongarch64/gemv_t.S @@ -0,0 +1,436 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f1 +#define x3 $f2 +#define x4 $f4 +#define x5 $f5 +#define x6 $f6 +#define x7 $f7 +#define x8 $f18 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d 
I, M, 3 + move XX, XORIG + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a3, AO1, 1 * SIZE + LD x3, XX, 2 * SIZE + LD a4, AO2, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a5, AO1, 2 * SIZE + LD x5, XX, 4 * SIZE + LD a6, AO2, 2 * SIZE + LD x6, XX, 5 * SIZE + LD a7, AO1, 3 * SIZE + LD x7, XX, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y2, a2, x5, y2 + LD a2, AO2, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + MADD y4, a4, x6, y4 + LD a4, AO2, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y2, a6, x7, y2 + LD a6, AO2, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + MADD y4, a8, x8, y4 + LD a8, AO2, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y2, a2, x5, y2 + MADD y3, a3, x6, y3 + MADD y4, a4, x6, y4 + MADD y1, a5, x7, y1 + addi.d XX, XX, 8 * SIZE 
+ MADD y2, a6, x7, y2 + addi.d AO1, AO1, 8 * SIZE + MADD y3, a7, x8, y3 + addi.d AO2, AO2, 8 * SIZE + MADD y4, a8, x8, y4 + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 2 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y4, a4, x2, y4 + LD a8, AO2, 3 * SIZE + MADD y1, a5, x3, y1 + MADD y2, a6, x3, y2 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + MADD y4, a8, x4, y4 + addi.d AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + ADD y2, y2, y4 + bge $r0, I, .L19 + .align 3 +.L18: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO2, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + addi.d AO2, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + blt $r0, I, .L18 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + addi.d J, J, -1 + MADD a2, y2, ALPHA, a2 + MTC y1, $r0 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + move AO1, A + bge $r0, J, .L999 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + LD a7, AO1, 3 * SIZE + LD x4, XX, 3 * SIZE + LD x5, XX, 4 * SIZE + LD x6, XX, 5 * SIZE + LD x7, XX, 6 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, 
AO1, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y3, a3, x6, y3 + MADD y1, a5, x7, y1 + MADD y3, a7, x8, y3 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y1, a5, x3, y1 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + bge $r0, I, .L29 + .align 3 +.L28: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + MADD y1, a1, x1, y1 + blt $r0, I, .L28 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S new file mode 100644 index 000000000..0f9e1bc59 --- /dev/null +++ b/kernel/loongarch64/iamax.S @@ -0,0 +1,233 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + 
CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S new file mode 100644 index 000000000..7751a9d03 --- /dev/null +++ b/kernel/loongarch64/iamin.S @@ -0,0 +1,233 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + 
CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S new file mode 100644 index 000000000..6d7cb9e30 --- /dev/null +++ b/kernel/loongarch64/izamax.S @@ -0,0 +1,217 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + 
CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S new file mode 100644 index 000000000..998927985 --- /dev/null +++ b/kernel/loongarch64/izamin.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + 
CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, 
s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT 
$fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..7399e57b3 --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, 
X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, 
XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..14b62cfe7 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 
+ +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S 
b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..c9d8f7fc1 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 
* SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + 
ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, 
B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD 
c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE 
+ LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, 
c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 
* SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, 
b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE 
+ MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 
* SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, 
AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, 
BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB 
c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, 
c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, 
CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE 
+ LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if 
defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, 
AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, 
b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + 
SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, 
c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + 
move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, 
-1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge 
$r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD 
c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + 
addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, 
a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, 
.L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + 
NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S new file mode 100644 index 000000000..aa6822c32 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT.S @@ -0,0 +1,2854 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + 
sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, 
b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD 
c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD 
b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, 
CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE 
+ NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + 
LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST 
c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, 
B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, 
b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + 
MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + 
MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE 
+#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, 
KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt 
$r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, 
c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST 
c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD 
c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + 
NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 
+#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD 
a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, 
b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD 
c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE 
+#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * 
SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK 
+ slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, 
B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S new file mode 100644 index 000000000..c86d9c1e5 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT.S @@ -0,0 +1,2850 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + 
sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE 
+addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + MOV c21, c11 + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, 
BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MOV c21, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 
0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE 
+ MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + 
LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * 
SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL 
c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, 
KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + 
MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, 
c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST 
c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, 
b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB 
c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, 
CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD 
c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD 
a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, 
a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, 
BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + 
NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD 
b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB 
c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * 
SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + 
BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD 
b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE 
+ LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD 
b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, 
CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S new file mode 100644 index 000000000..f998bdc23 --- /dev/null +++ b/kernel/loongarch64/zamax.S @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, 
a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S new file mode 100644 index 000000000..bde9aebf8 --- /dev/null +++ b/kernel/loongarch64/zamin.S @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + NOP + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + 
.align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + NOP + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + NOP + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + NOP + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + NOP + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + + EPILOGUE diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S new file mode 100644 index 000000000..d1a1a732c --- /dev/null +++ b/kernel/loongarch64/zasum.S @@ -0,0 +1,158 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bge $r0, N, .L999 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 1 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 1 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + LD a8, X, 1 * SIZE + ADD s2, s2, t4 + add.d X, X, INCX + FABS t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + addi.d I, I, -1 + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t1 + ADD s2, s2, t2 + blt $r0, I, .L26 + .align 3 
+ +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S new file mode 100644 index 000000000..3fbe56074 --- /dev/null +++ b/kernel/loongarch64/zcopy.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, 2 * SIZE + NOP + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 2 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + ST a1, Y, -2 * SIZE + addi.d I, I, -1 + ST a2, Y, -1 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + .align 3 + +.L20: + srai.d I, N, 2 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, 
X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + LD a1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + LD a3, X, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + LD a5, X, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + LD a6, X, 1 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + LD a7, X, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S new file mode 100644 index 000000000..087c3845f --- /dev/null +++ b/kernel/loongarch64/zdot.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, 
a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * 
SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE 
+ MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, 
a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + 
MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: 
+ LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * 
SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD 
c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, 
$f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 
* SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 
* SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD 
b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + 
LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD 
b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + 
LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 
+.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD 
+#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 
1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + 
MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, 
b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 
c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + 
MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 
+#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, 
c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + 
add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 
3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..0cc49c789 --- /dev/null +++ 
b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD 
a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, 
t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 
t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + 
LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * 
SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST 
a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..85a9a0c0d --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, 
$sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * 
SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 
+ MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + 
MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, 
x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 
+ LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 
1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..fe53ed713 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 
2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD 
a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 
1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 
+ MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, 
$sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, 
c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + 
LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, 
a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * 
SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + 
LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE 
+ LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, 
$r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, 
b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + 
add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, 
c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d 
CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD 
+#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d 
BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if 
defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d 
TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 
+#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 
c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD 
a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * 
SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + 
MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, 
a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + 
SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 
c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 
* SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile diff --git a/param.h b/param.h index 965b97466..634e0ef5d 100644 --- a/param.h +++ b/param.h @@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +#if defined (LOONGSON3R5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 4d7dfe4845078dbe57afed8bb4181451d8cd3734 Mon Sep 17 00:00:00 2001 From: Craig Watson Date: Tue, 27 Jul 2021 09:00:30 +0000 Subject: [PATCH 012/143] Include Haiku in processor count checks --- driver/others/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 
6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6e654ccf2..39ed264e8 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1979,7 +1979,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int 
blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1987,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2011,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 02d4a49761f2ed74e0fe6943c3a3759ebed45ea3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 15 Jul 2021 04:54:33 -0500 Subject: [PATCH 013/143] Also make sure the `1` is INTEGER*4 for OMP_SET_NUM_THREADS --- lapack-netlib/TESTING/EIG/cchkee.F | 8 +++++--- lapack-netlib/TESTING/EIG/dchkee.F | 5 +++-- lapack-netlib/TESTING/EIG/schkee.F | 5 +++-- lapack-netlib/TESTING/EIG/zchkee.F | 8 +++++--- 4 
files changed, 16 insertions(+), 10 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index ab54078a3..ef9f71ec9 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CST', NOUT ) #if defined(_OPENMP) @@ -2340,7 +2341,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CHB', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 6399fecef..89b6958fe 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
@@ -1878,7 +1878,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL DERRST( 'DST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index 5484a7c26..b58433959 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1879,7 +1879,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL SERRST( 'SST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 7e9144d15..fb418a43b 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
@@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZST', NOUT ) #if defined(_OPENMP) @@ -2338,7 +2339,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZHB', NOUT ) #if defined(_OPENMP) From 34207bdf5b91373c08fbebf038b43e5b8c9ed7cf Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jul 2021 18:11:12 +0800 Subject: [PATCH 014/143] Fixed typos about LOONGARCH64 --- Makefile.system | 2 +- common_loongarch64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 4084390db..13c946ba1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -856,7 +856,7 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), loongarch64) -ifeq ($(CORE), LOONGSONG3R5) +ifeq ($(CORE), LOONGSON3R5) CCOMMON_OPT += -march=loongarch64 -mabi=lp64 FCOMMON_OPT += -march=loongarch64 -mabi=lp64 endif diff --git a/common_loongarch64.h b/common_loongarch64.h index 959e7e58a..e15539b5f 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -186,7 +186,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 20) -#define PAGESIZE (16UL << 1) +#define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #define HUGE_PAGESIZE ( 2 << 20) From cbc41973fde6137bc42c34de64a41b5a82b597c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:20:12 +0200 Subject: [PATCH 015/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 17f29fe69..f785d3f90 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} 
-DADD${BU} -DCBLAS") +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 From b4f4ed378b2343b0af8b1235838feef4f6c8c51c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:21:08 +0200 Subject: [PATCH 016/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d338242ff..e4ee8b28b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() + if (BUILD_SINGLE) list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) From e78fbe46541dedcf39eb0362e69b1de6f7808642 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:44:54 +0200 Subject: [PATCH 017/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 15c83a907..c5e1094da 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,9 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif override TARGET_ARCH= override TARGET_MACH= From 5dc6aa74f05cc6c4405be195461fa5afc2c03888 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:46:19 +0200 Subject: [PATCH 018/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 6c5f041c2..923f1537c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,8 @@ TOPDIR = .. include ../Makefile.system - +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif ifeq ($(NOFORTRAN),1) all :: From f2a7a67f5afa31e1e8839e5a386773e45bb5a687 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Jul 2021 17:23:40 +0200 Subject: [PATCH 019/143] Improve the "tried to allocate too many buffers" error message --- driver/others/memory.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 39ed264e8..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - -#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; + +#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || 
defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((*func != NULL) && (map_address == (void *) -1)) { + while ((func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,12 +1619,10 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif - +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #ifdef _WIN64 -static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1633,12 +1631,10 @@ static void (APIENTRY 
*dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif - +static int(*p_process_term)(void) = on_process_term; #ifdef _WIN64 -static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else -static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1672,23 +1668,16 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#elif !defined(OS_EMBEDDED) -#define ALLOC_MMAP -#define ALLOC_MALLOC #else +#define ALLOC_MMAP #define ALLOC_MALLOC - -inline int puts(const char *str) { return 0; } -inline int printf(const char *format, ...) { return 0; } -inline char *getenv(const char *name) { return ""; } -inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1702,6 +1691,7 @@ inline int atoi(const char *str) { return 0; } #include #include #include +#include #include #include #include @@ -1979,7 +1969,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1977,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || 
defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -2011,7 +2001,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 0b8f7c8c10957aa1d7836cb8ae55337d180d5a75 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 2 Aug 2021 10:00:41 +0800 Subject: [PATCH 020/143] Add cmake support for LOONGARCH64 --- cmake/arch.cmake | 4 ++++ cmake/cc.cmake | 9 +++++++++ cmake/fc.cmake | 7 +++++++ cmake/system_check.cmake | 4 +++- kernel/loongarch64/KERNEL | 2 ++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 154e59db6..57ee5a4fb 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -113,6 +113,10 @@ if (MIPS64) set(NO_BINARY_MODE 1) endif () +if (LOONGARCH64) + set(NO_BINARY_MODE 1) +endif () + if (${ARCH} STREQUAL "alpha") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index ac5e455d5..1794b5e5b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () + if (LOONGARCH64) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () + set(BINARY_DEFINED 1) + 
endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index fc1f9bb22..631664569 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () + if (LOONGARCH64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..8d0558c0e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") set(PPC 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") + set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -95,7 +97,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64) + if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL index e96a90e72..1c11df9b6 100644 --- a/kernel/loongarch64/KERNEL +++ b/kernel/loongarch64/KERNEL @@ -234,3 +234,5 @@ endif ifndef ZGEMM3MKERNEL ZGEMM3MKERNEL = zgemm3m_kernel.S endif + +DSDOTKERNEL = dot.S From 0a2077901cf94877f6173f6b580762b68b2fd2e0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 28 Apr 2020 19:01:36 +0800 Subject: [PATCH 021/143] Add small marix optimization kernel interface. 
make SMALL_MATRIX_OPT=1 --- Makefile.system | 5 ++ common_d.h | 6 ++ common_level3.h | 12 ++++ common_macro.h | 16 +++++ common_s.h | 5 ++ interface/gemm.c | 28 +++++++- kernel/Makefile.L3 | 73 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_nt.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tt.c | 49 +++++++++++++ 11 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tt.c diff --git a/Makefile.system b/Makefile.system index 13c946ba1..20d8d2f2a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -244,6 +244,11 @@ else ONLY_CBLAS = 0 endif +#For small matrix optimization +ifeq ($(SMALL_MATRIX_OPT), 1) +CCOMMON_OPT += -DSMALL_MATRIX_OPT +endif + # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/common_d.h b/common_d.h index 94dc3eea8..dad304a5f 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,12 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k + +#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn +#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt +#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn +#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index c4f9435a9..751592b67 100644 --- a/common_level3.h +++ b/common_level3.h @@ -515,6 +515,18 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif +#ifdef SMALL_MATRIX_OPT +int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, 
double * C, BLASLONG ldc); +int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +#endif + int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 0136f18ab..eb2abcdc0 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,11 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT + #elif defined(BFLOAT16) #define D_TO_BF16_K SBDTOBF16_K @@ -931,6 +936,11 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else @@ -1236,6 +1246,12 @@ #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K + +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else #ifdef XDOUBLE diff --git a/common_s.h b/common_s.h index 34903ec49..6ad98ba8b 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,11 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn +#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt +#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn +#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index 
10426fd8f..d2fb42ff7 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,6 +105,18 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#ifdef SMALL_MATRIX_OPT +//Only support s/dgemm small matrix optimiztion so far. +static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, +#endif +#endif +}; +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -417,6 +429,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); + MNK = (double) args.m * (double) args.n * (double) args.k; + +#ifdef SMALL_MATRIX_OPT +#if !defined(COMPLEX) + //need to tune small matrices cases. + if(MNK <= 100.0*100.0*100.0){ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, + args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + return; + } +#endif +#endif + + buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); @@ -428,7 +454,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - MNK = (double) args.m * (double) args.n * (double) args.k; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d9e3ec36..88e5eb2d6 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -447,6 +447,19 @@ XBLASOBJS += \ endif +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +SBLASOBJS += \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) 
sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +endif + ###### BLAS extensions ##### ifeq ($(BUILD_SINGLE),1) @@ -4237,3 +4250,63 @@ endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SAMLL_K_NN +DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SAMLL_K_NT +DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SAMLL_K_TN +DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SAMLL_K_TT +DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + + +ifndef SGEMM_SAMLL_K_NN +SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SAMLL_K_NT +SGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SAMLL_K_TN +SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SAMLL_K_TT +SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(SGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..efcc27cba --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +{ + //naive implemtation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i Date: Tue, 28 Apr 2020 22:35:36 +0800 Subject: [PATCH 022/143] Add alpha=1.0 beta=0.0 for small gemm. --- common_d.h | 5 ++ common_level3.h | 11 ++++ common_macro.h | 14 ++++ common_s.h | 5 ++ interface/gemm.c | 18 +++++- kernel/Makefile.L3 | 64 ++++++++++++++++++- .../gemm_small_matrix_kernel_a1b0_nn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_nt.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tt.c | 49 ++++++++++++++ 10 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tt.c diff --git a/common_d.h b/common_d.h index dad304a5f..f5d7935fa 100644 --- a/common_d.h +++ b/common_d.h @@ -163,6 +163,11 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt +#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn 
+#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt +#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn +#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index 751592b67..31d514cd5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -525,6 +525,17 @@ int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + +int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index eb2abcdc0..2f7263023 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,6 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT #elif defined(BFLOAT16) @@ -941,6 +945,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else @@ -1252,6 +1261,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else #ifdef XDOUBLE diff --git a/common_s.h b/common_s.h index 6ad98ba8b..440b78723 100644 --- a/common_s.h +++ b/common_s.h @@ -169,6 +169,11 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt +#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn +#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt +#define SGEMM_SMALL_KERNEL_A1B0_TN 
sgemm_small_kernel_a1b0_tn +#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index d2fb42ff7..da602f7a9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -115,6 +115,15 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif #endif }; + +static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, +#endif +#endif +}; #endif #ifndef CBLAS @@ -435,8 +444,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) //need to tune small matrices cases. if(MNK <= 100.0*100.0*100.0){ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, - args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + + if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + }else{ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + } + return; } #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 88e5eb2d6..448d22e4e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -452,11 +452,15 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + 
sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4282,6 +4286,34 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef DGEMM_SAMLL_K_A1B0_NN +DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_NT +DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TN +DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TT +DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + ifndef SGEMM_SAMLL_K_NN SGEMM_SAMLL_K_NN = 
../generic/gemm_small_matrix_kernel_nn.c @@ -4310,3 +4342,31 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SAMLL_K_A1B0_NN +SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_NT +SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_TN +SGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_TT +SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c new file mode 100644 index 000000000..8e3417027 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +{ + //naive implemtation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i Date: Tue, 28 Apr 2020 23:15:20 +0800 Subject: [PATCH 023/143] Fix gemm interface bug for small matrix. 
--- interface/gemm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index da602f7a9..4f1bbfd1c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -145,7 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -269,8 +269,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; +#endif + +#ifdef SMP #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -438,7 +441,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#if defined(SMP) || defined(SMALL_MATRIX_OPT) MNK = (double) args.m * (double) args.n * (double) args.k; +#endif #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) From 59cb5de46b89a080d1190e89bed543fd32f924c7 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 29 Apr 2020 00:19:19 +0800 Subject: [PATCH 024/143] Refs #2587 Fix typos. 
--- kernel/Makefile.L3 | 96 +++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 448d22e4e..6476334e9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4258,115 +4258,115 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### -ifndef DGEMM_SAMLL_K_NN -DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef DGEMM_SAMLL_K_NT -DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef DGEMM_SAMLL_K_TN -DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef DGEMM_SAMLL_K_TT -DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef 
DGEMM_SAMLL_K_A1B0_NN -DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_A1B0_NN +DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef DGEMM_SAMLL_K_A1B0_NT -DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef DGEMM_SMALL_K_A1B0_NT +DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef DGEMM_SAMLL_K_A1B0_TN -DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_A1B0_TN +DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef DGEMM_SAMLL_K_A1B0_TT -DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_A1B0_TT +DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_NN -SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef SGEMM_SAMLL_K_NT -SGEMM_SAMLL_K_NT = 
../generic/gemm_small_matrix_kernel_nt.c +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef SGEMM_SAMLL_K_TN -SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef SGEMM_SAMLL_K_TT -SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NT) +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_A1B0_NN -SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_A1B0_NN +SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef SGEMM_SAMLL_K_A1B0_NT -SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_A1B0_NT +SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef SGEMM_SAMLL_K_A1B0_TN -SGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_A1B0_TN +SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef 
SGEMM_SAMLL_K_A1B0_TT -SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_A1B0_TT +SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ From 17d32a4a8271141be2fb96c8c767ac1ed2e60a36 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 07:55:27 +0800 Subject: [PATCH 025/143] Change a1b0 gemm to b0 gemm. 
--- common_d.h | 8 +-- common_level3.h | 18 +++--- common_macro.h | 24 ++++---- common_s.h | 8 +-- interface/gemm.c | 10 ++-- kernel/Makefile.L3 | 56 +++++++++---------- ..._nn.c => gemm_small_matrix_kernel_b0_nn.c} | 4 +- ..._nt.c => gemm_small_matrix_kernel_b0_nt.c} | 4 +- ..._tn.c => gemm_small_matrix_kernel_b0_tn.c} | 4 +- ..._tt.c => gemm_small_matrix_kernel_b0_tt.c} | 4 +- 10 files changed, 70 insertions(+), 70 deletions(-) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nn.c => gemm_small_matrix_kernel_b0_nn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nt.c => gemm_small_matrix_kernel_b0_nt.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tn.c => gemm_small_matrix_kernel_b0_tn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tt.c => gemm_small_matrix_kernel_b0_tt.c} (95%) diff --git a/common_d.h b/common_d.h index f5d7935fa..42c14e828 100644 --- a/common_d.h +++ b/common_d.h @@ -163,10 +163,10 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt -#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn -#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt -#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn -#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt +#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn +#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt +#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn +#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt #else diff --git a/common_level3.h b/common_level3.h index 31d514cd5..7be7ab06b 100644 --- a/common_level3.h +++ b/common_level3.h @@ -526,15 +526,15 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, 
BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/common_macro.h b/common_macro.h index 2f7263023..fa7884180 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,10 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT #elif defined(BFLOAT16) @@ -945,10 +945,10 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif @@ -1261,10 +1261,10 @@ 
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif #else diff --git a/common_s.h b/common_s.h index 440b78723..685d73062 100644 --- a/common_s.h +++ b/common_s.h @@ -169,10 +169,10 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt -#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn -#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt -#define SGEMM_SMALL_KERNEL_A1B0_TN sgemm_small_kernel_a1b0_tn -#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt +#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn +#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt +#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn +#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt #else diff --git a/interface/gemm.c b/interface/gemm.c index 4f1bbfd1c..3730f37fa 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -116,11 +116,11 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif }; -static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef COMPLEX - GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, + 
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif #endif }; @@ -450,8 +450,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS //need to tune small matrices cases. if(MNK <= 100.0*100.0*100.0){ - if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + if(*(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6476334e9..c9544086a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -453,14 +453,14 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) 
dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4286,32 +4286,32 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef DGEMM_SMALL_K_A1B0_NN -DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef DGEMM_SMALL_K_A1B0_NT -DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef DGEMM_SMALL_K_A1B0_TN -DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef DGEMM_SMALL_K_A1B0_TT -DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) +$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ 
-$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4343,30 +4343,30 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SMALL_K_A1B0_NN -SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef SGEMM_SMALL_K_A1B0_NT -SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef SGEMM_SMALL_K_A1B0_TN -SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef SGEMM_SMALL_K_A1B0_TT -SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
-$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c similarity index 95% rename from kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c rename to kernel/generic/gemm_small_matrix_kernel_b0_nn.c index 8e3417027..3be918017 100644 --- a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) { //naive implemtation //Column major @@ -41,7 +41,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B for(k=0; k Date: Fri, 28 Aug 2020 21:00:54 +0800 Subject: [PATCH 026/143] Refs #2587 Add small matrix optimization reference kernel for c/zgemm. 
--- common_c.h | 40 +++ common_level3.h | 80 +++++ common_macro.h | 80 +++++ common_z.h | 40 +++ interface/gemm.c | 35 ++- kernel/Makefile.L3 | 293 ++++++++++++++++++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 +++++ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 +++++ kernel/generic/zgemm_small_matrix_kernel_nn.c | 78 +++++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 82 +++++ 14 files changed, 1193 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tt.c diff --git a/common_c.h b/common_c.h index 40ecf5b8b..9388ece93 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,46 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn +#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt +#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr +#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc + +#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn +#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt +#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr +#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc + +#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn +#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt +#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr +#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc 
+ +#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn +#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct +#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr +#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc + +#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn +#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt +#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr +#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc + +#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn +#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt +#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr +#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc + +#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn +#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt +#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr +#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc + +#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn +#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct +#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr +#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc + #else #define CAMAX_K gotoblas -> camax_k diff --git a/common_level3.h b/common_level3.h index 7be7ab06b..5741f56d5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,6 +536,86 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, 
BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, 
BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * 
C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float 
* C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, 
double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index fa7884180..2cccf9b39 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2093,6 +2093,46 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN 
+#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC + #else #define AMAX_K CAMAX_K @@ -2516,6 +2556,46 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR +#define 
GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC + #endif #endif diff --git a/common_z.h b/common_z.h index f1e78dd08..8594ec74d 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,46 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn +#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt +#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr +#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc + +#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn +#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt +#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr +#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc + +#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn +#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt +#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr +#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc + +#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn +#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct +#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr +#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc + +#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn +#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt +#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr +#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc + +#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn +#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt +#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr +#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc + +#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn +#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt +#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr +#define ZGEMM_SMALL_KERNEL_B0_RC 
zgemm_small_kernel_b0_rc + +#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn +#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct +#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr +#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc + #else #define ZAMAX_K gotoblas -> zamax_k diff --git a/interface/gemm.c b/interface/gemm.c index 3730f37fa..b73baa9bd 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -124,6 +124,28 @@ static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLAS #endif #endif }; + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +#endif +#endif +}; + +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +#endif +#endif +}; #endif #ifndef CBLAS @@ -446,20 +468,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT -#if !defined(COMPLEX) //need to tune small matrices cases. 
if(MNK <= 100.0*100.0*100.0){ - + +#if !defined(COMPLEX) if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } - +#else + if(beta[0] == 0.0 && beta[1] == 0.0){ + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + }else{ + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + } +#endif return; } #endif -#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index c9544086a..1c4a00158 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -462,6 +462,42 @@ DBLASOBJS += \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +CBLASOBJS += \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + 
cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) 
zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + endif ###### BLAS extensions ##### @@ -4370,3 +4406,260 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE 
-DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = 
../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT +ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) 
$(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c new file mode 100644 index 000000000..11e746e52 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c new file mode 100644 index 000000000..1ef743017 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c new file mode 100644 index 000000000..2cd3ebcf2 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c new file mode 100644 index 000000000..25b05b4aa --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..6ef1b9655 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c new file mode 100644 index 000000000..3c81ad79e --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c new file mode 100644 index 000000000..143190bb1 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c new file mode 100644 index 000000000..246e26e84 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} From 6022e5629c7708b114a3c2387e652ebd32122300 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 22:36:36 +0800 Subject: [PATCH 027/143] Refs #2587 fix small matrix c/zgemm bug. 
--- common_level3.h | 150 +++++++++--------- interface/gemm.c | 22 ++- .../generic/zgemm_small_matrix_kernel_b0_nn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_nt.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tt.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_nn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_nt.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tt.c | 10 +- 10 files changed, 116 insertions(+), 120 deletions(-) diff --git a/common_level3.h b/common_level3.h index 5741f56d5..a3a487dab 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,85 +536,85 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG 
m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); -int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float 
* B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float 
beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); -int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_ct(BLASLONG m, 
BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG 
lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG 
ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, 
float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, 
float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int 
zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/interface/gemm.c b/interface/gemm.c index b73baa9bd..7251993ee 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,47 +106,43 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT -//Only support s/dgemm small matrix optimiztion so far. 
+ +#ifndef COMPLEX static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, #endif -#endif }; static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif -#endif }; -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#else + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, #endif -#endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, 
GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif -#endif }; #endif +#endif #ifndef CBLAS @@ -479,9 +475,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS } #else if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } #endif return; diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c index 11e746e52..3ab057fef 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; @@ -65,8 +65,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c index 1ef743017..dc35f4a6d 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c index 2cd3ebcf2..479a56e8f 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c index 25b05b4aa..b698973dd 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c index 6ef1b9655..4bf6bf7ee 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -65,12 +65,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c index 3c81ad79e..288e49c13 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c index 143190bb1..1e2a5aed4 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c index 246e26e84..180043539 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } From 9186456a1297f7ee97bae56370c404114933a5ee Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 10:45:10 +0000 Subject: [PATCH 028/143] small matrix: SkylakeX: add SGEMM NN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nn_skylakex.c | 424 ++++++++++++++++++ 3 files changed, 428 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3d71584fe..1a2e67b52 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,8 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c 
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..704e964b8 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..f2c79873e --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,424 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#else +#define STORE_512(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ + _mm512_storeu_ps(&C[offset##M##N], result##M##N) +#endif + +#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() +#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ + _mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N) 
+#else +#define STORE_256(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \ + result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \ + _mm256_storeu_ps(&C[offset##M##N], result##M##N) +#endif + +#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):) +#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)]) +#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ + _mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N) +#else +#define STORE_128(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*4); \ + result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) +#endif + +#define DECLARE_RESULT_S(M, N) float result##M##N = 0; +#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M] +#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] +#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N +#if defined(B0) +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha +#else +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + 
BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + __mmask8 mask = 0xff; // just use to avoid SSE instruction + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, 
x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); 
STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m8; i += 8) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + for (k = 0; k < K; 
k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_256(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + } + __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_128(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); + DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + MATMUL_S(0, 2); MATMUL_S(1, 2); + 
MATMUL_S(0, 3); MATMUL_S(1, 3); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + STORE_S(0, 2); STORE_S(1, 2); + STORE_S(0, 3); STORE_S(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); MATMUL_S(1, 0); + } + STORE_S(0, 0); STORE_S(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + DECLARE_RESULT_S(0, 2); + DECLARE_RESULT_S(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); + MATMUL_S(0, 1); + MATMUL_S(0, 2); + MATMUL_S(0, 3); + } + STORE_S(0, 0); + STORE_S(0, 1); + STORE_S(0, 2); + STORE_S(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); + MATMUL_S(0, 1); + } + STORE_S(0, 0); + STORE_S(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); + } + STORE_S(0, 0); + } + } +} From f88470323bdb72a1e3ac54717606810699319d3b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 15:59:14 +0000 Subject: [PATCH 029/143] Optimize M < 16 using AVX512 mask --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) 
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index f2c79873e..f0b6d63a6 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -31,17 +31,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ _mm512_storeu_ps(&C[offset##M##N], result##M##N) +#define MASK_STORE_512(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) #endif #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() @@ -241,6 +249,51 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } + if (M - i > 0) { + register __mmask16 mask asm("k1") = 
(1UL << (M - i)) - 1; + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 49b61a3f3027e24f19e78e573e50c86432aec574 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:24:10 +0000 Subject: [PATCH 030/143] Small Matrix: skylakex: sgemm_nn: optimize for M <= 8 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 302 +++++++++++++++++- 1 file changed, 301 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index f0b6d63a6..ae4a9daa3 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "common.h" #include +#include #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) @@ -52,6 +53,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) #endif +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#endif + + + #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() #define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) #define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) @@ -249,7 +262,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } - if (M - i > 0) { + if (M - i > 8) { register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1; for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); @@ -294,6 +307,293 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return; } + int mm = M - i; + if (mm) { + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask8 = (1UL << mm) - 1; + __mmask16 mask; + BLASLONG k16 = K & ~15; + BLASLONG k8 = K & ~7; + for (k = 0; k < k8; k += 8) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(0 + 
k)]); + r1 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(3 + k)]); + r4 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(4 + k)]); + r5 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(5 + k)]); + r6 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(6 + k)]); + r7 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(7 + k)]); + + t0 = _mm256_unpacklo_ps(r0, r1); + t1 = _mm256_unpackhi_ps(r0, r1); + t2 = _mm256_unpacklo_ps(r2, r3); + t3 = _mm256_unpackhi_ps(r2, r3); + t4 = _mm256_unpacklo_ps(r4, r5); + t5 = _mm256_unpackhi_ps(r4, r5); + t6 = _mm256_unpacklo_ps(r6, r7); + t7 = _mm256_unpackhi_ps(r6, r7); + + r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0)); + r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2)); + r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); + r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); + r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0)); + r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2)); + r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); + r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); + + t0 = _mm256_permute2f128_ps(r0, r4, 0x20); + t1 = _mm256_permute2f128_ps(r1, r5, 0x20); + t2 = _mm256_permute2f128_ps(r2, r6, 0x20); + t3 = _mm256_permute2f128_ps(r3, r7, 0x20); + t4 = _mm256_permute2f128_ps(r0, r4, 0x31); + t5 = _mm256_permute2f128_ps(r1, r5, 0x31); + t6 = _mm256_permute2f128_ps(r2, r6, 0x31); + t7 = _mm256_permute2f128_ps(r3, r7, 0x31); + + switch (mm) { + case 8: _mm256_storeu_ps(&mbuf[k + 7*K], t7); + case 7: _mm256_storeu_ps(&mbuf[k + 6*K], t6); + case 6: _mm256_storeu_ps(&mbuf[k + 5*K], t5); + case 5: _mm256_storeu_ps(&mbuf[k + 4*K], t4); + case 4: _mm256_storeu_ps(&mbuf[k + 3*K], t3); + case 3: _mm256_storeu_ps(&mbuf[k + 2*K], t2); + case 2: _mm256_storeu_ps(&mbuf[k + 1*K], t1); + case 1: _mm256_storeu_ps(&mbuf[k + 0*K], t0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + 
} + } + int mi = 0; + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 
3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + 
for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + STORE_REDUCE(0, 2); + STORE_REDUCE(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 3d8c6d9607c82a999ad8661834d0d78605a5f321 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:33:07 +0000 Subject: [PATCH 031/143] Small Matrix: skylakex: sgemm nn: clean up unused code --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 
222 ------------------ 1 file changed, 222 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index ae4a9daa3..a5c530593 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -63,48 +63,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #endif - - -#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() -#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) -#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - _mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N) -#else -#define STORE_256(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \ - result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \ - _mm256_storeu_ps(&C[offset##M##N], result##M##N) -#endif - -#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):) -#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)]) -#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - _mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N) -#else -#define STORE_128(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + 
(M*4); \ - result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \ - _mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) -#endif - -#define DECLARE_RESULT_S(M, N) float result##M##N = 0; -#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M] -#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] -#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N -#if defined(B0) -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha -#else -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta -#endif - #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else @@ -594,184 +552,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp free(mbuf); return; } - __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m8; i += 8) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - DECLARE_RESULT_256(0, 2); - DECLARE_RESULT_256(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - MATMUL_256(0, 2); - MATMUL_256(0, 3); - } - STORE_256(0, 0); - STORE_256(0, 1); - STORE_256(0, 2); - STORE_256(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - MATMUL_256(0, 0); - MATMUL_256(0, 1); - } - STORE_256(0, 0); - STORE_256(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_256(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_256(0, 
x); - BROADCAST_LOAD_B_256(x, 0); - MATMUL_256(0, 0); - } - STORE_256(0, 0); - } - } - __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m4; i += 4) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - DECLARE_RESULT_128(0, 2); - DECLARE_RESULT_128(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - MATMUL_128(0, 2); - MATMUL_128(0, 3); - } - STORE_128(0, 0); - STORE_128(0, 1); - STORE_128(0, 2); - STORE_128(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - MATMUL_128(0, 0); - MATMUL_128(0, 1); - } - STORE_128(0, 0); - STORE_128(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_128(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); - MATMUL_128(0, 0); - } - STORE_128(0, 0); - } - } - for (; i < m2; i += 2) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); - DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); - DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - MATMUL_S(0, 2); MATMUL_S(1, 2); - MATMUL_S(0, 3); MATMUL_S(1, 3); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - STORE_S(0, 2); STORE_S(1, 2); - STORE_S(0, 3); STORE_S(1, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); 
DECLARE_RESULT_S(1, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); MATMUL_S(1, 0); - } - STORE_S(0, 0); STORE_S(1, 0); - } - } - for (; i < M; i += 1) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - DECLARE_RESULT_S(0, 2); - DECLARE_RESULT_S(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); - MATMUL_S(0, 1); - MATMUL_S(0, 2); - MATMUL_S(0, 3); - } - STORE_S(0, 0); - STORE_S(0, 1); - STORE_S(0, 2); - STORE_S(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); - MATMUL_S(0, 1); - } - STORE_S(0, 0); - STORE_S(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); - } - STORE_S(0, 0); - } - } } From 13b32f69b78b15e7d95978011ea6c2bb3d9e3642 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 12 May 2021 17:08:18 +0000 Subject: [PATCH 032/143] Small Matrix: skylakex: sgemm nn: reduce store 4 M at a time --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 64 ++++++++++++++----- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index a5c530593..be9f085c0 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ 
b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -57,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#define REDUCE_M4(N) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \ + r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); #if defined(B0) #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} #else #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} #endif #if defined(B0) @@ -75,14 +95,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, 
BLASLONG lda, FLOAT alp BLASLONG m64 = M & ~63; BLASLONG m32 = M & ~31; BLASLONG m16 = M & ~15; - BLASLONG m8 = M & ~7; BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; BLASLONG n4 = N & ~3; BLASLONG n2 = N & ~1; - __mmask8 mask = 0xff; // just use to avoid SSE instruction __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) @@ -220,8 +238,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } - if (M - i > 8) { - register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1; + int mm = M - i; + if (!mm) return 0; + if (mm > 8 || K < 32) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -263,10 +283,20 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } MASK_STORE_512(0, 0); } - return; - } - int mm = M - i; - if (mm) { + } else { + /* M => [1, 8] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 16 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. 
+ */ FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); __mmask8 mask8 = (1UL << mm) - 1; __mmask16 mask; @@ -328,6 +358,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } int mi = 0; + mask8 = 0xff; // just use to avoid SSE instruction + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -354,10 +389,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); - STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2); - STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3); + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -378,9 +410,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); - + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); } for (; j < N; j += 1) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -398,7 +428,7 @@ int CNAME(BLASLONG M, BLASLONG N, 
BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE_M4(0); } } @@ -550,6 +580,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } free(mbuf); - return; } + return 0; } From 4c9d9940fdd6a458289a02e850afd65d5b9689ba Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 13 May 2021 09:41:51 +0000 Subject: [PATCH 033/143] Small Matrix: skylakex: sgemm nn: reduce store 4 N at a time --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index be9f085c0..c9f43f9a2 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -57,10 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) -#define REDUCE_M4(N) \ +#define REDUCE_4(rr0, rr1, rr2, rr3) \ __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ - r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \ - r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ @@ -68,12 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) #if defined(B0) #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); #define STORE_REDUCE_M4(N) {\ REDUCE_M4(N) \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #else #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #define STORE_REDUCE_M4(N) {\ @@ -81,6 +87,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ + s0 = _mm_fmadd_ps(s1, beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #endif #if defined(B0) @@ -363,6 +375,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp #if !defined(B0) __m128 beta_128 = _mm_broadcast_ss(&beta); #endif + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -458,10 +471,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(0, 3); MATMUL_512(1, 3); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); - STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); - STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); @@ -532,10 +542,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(0, 3); } - STORE_REDUCE(0, 0); - STORE_REDUCE(0, 1); - STORE_REDUCE(0, 2); - STORE_REDUCE(0, 3); + STORE_REDUCE_N4(0); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); From a87736346fd3988618c0d8895827566fce5a5487 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 13 May 2021 10:16:54 +0000 Subject: [PATCH 034/143] Small Matrix: skylakex: sgemm nn: add n6 to improve performance --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 
c9f43f9a2..a67541161 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -110,6 +110,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; BLASLONG n2 = N & ~1; @@ -165,7 +166,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m32; i += 32) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (;j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); @@ -208,7 +236,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m16; i += 16) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 
1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); @@ -228,6 +283,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 2); STORE_512(0, 3); } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -254,26 +310,54 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp if (!mm) return 0; if (mm > 8 || K < 32) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); for (k = 0; k < K; k++) { MASK_LOAD_A_512(0, x); BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); MATMUL_512(0, 0); MATMUL_512(0, 1); MATMUL_512(0, 2); MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); } MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + 
BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 9967e61abb3ba0b87a043662382c515ed9d220bb Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 19 May 2021 10:50:03 +0000 Subject: [PATCH 035/143] Small Matrix: skylakex: sgemm nn: fix error when beta not zero --- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index a67541161..99856d0af 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -42,15 +42,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ - _mm512_storeu_ps(&C[offset##M##N], result##M##N) + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ - _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #endif #define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); From ca7682e3a3dceeb52ba1ad554f384388ffb24c9a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:24:31 +0000 Subject: [PATCH 036/143] Small Matrix: skylakex: sgemm nn: fix n6 conflicts with n4 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 99856d0af..9bc7a7c58 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -191,26 +191,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(1, 4); STORE_512(0, 5); STORE_512(1, 5); } - for (;j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); LOAD_A_512(1, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - STORE_512(0, 2); STORE_512(1, 2); - STORE_512(0, 3); STORE_512(1, 3); - } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); @@ -261,27 +241,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - STORE_512(0, 0); - STORE_512(0, 1); - STORE_512(0, 2); - STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -335,27 +294,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - MASK_LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - MASK_STORE_512(0, 
0); - MASK_STORE_512(0, 1); - MASK_STORE_512(0, 2); - MASK_STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 0d72d75bf9455c91b6f0c4ecf5b7555845dccf6f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:47:10 +0000 Subject: [PATCH 037/143] Small Matrix: skylakex: add sgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nt_skylakex.c | 366 ++++++++++++++++++ 3 files changed, 370 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1a2e67b52..d3560bf80 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,6 +12,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..6d7934be1 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..3fc842669 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,366 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N 
% 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + 
STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); 
MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < 
m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm > 0) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + 
DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } +} From ae3f5c737c24e6fdb7de4559969bee5631aa1683 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 21 May 2021 13:31:31 +0000 Subject: [PATCH 038/143] Small Matrix: skylakex: sgemm nt: optimize for M < 12 
--- .../x86_64/sgemm_small_kernel_nt_skylakex.c | 171 +++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index 3fc842669..f293bf9f9 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -35,11 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) #else #define STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ @@ -49,6 +57,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); #endif #if defined(B0) @@ -66,6 +82,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; BLASLONG n8 = N & ~7; BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; @@ -284,7 +302,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } int mm = M - i; - if (mm > 0) { + if (mm >= 12) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); @@ -362,5 +380,156 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } MASK_STORE_512(0, 0); } + } else if (mm > 0) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 
1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 
1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } } + return 0; } From 642c3938790b45606dea7450a6fbc23b6c9b9b9c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 26 May 2021 16:30:57 +0000 Subject: [PATCH 039/143] Small Matrix: skylakex: add sgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_tn_skylakex.c | 316 ++++++++++++++++++ 3 files changed, 320 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 
d3560bf80..5e0d9e5b4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,6 +14,8 @@ SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..0f9745b72 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..5a9a4ea32 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,316 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, 
s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) +#define STORE_M4(N, s0) _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); +#define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); + +#define STORE_N4(M, s0) \ + s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k16 = K & ~15; + + __mmask16 mask; + __mmask8 mask8 = 0xff; // just use to avoid SSE instruction + + __m128i vindex_n = _mm_set_epi32(ldc*3, 
ldc*2, ldc, 0); + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); 
DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int 
remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << 
remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 5dc7c3c8e572c1760cd9aba40dde1db54bb3f2e3 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:03:56 +0000 Subject: [PATCH 040/143] Small Matrix: add GEMM_SMALL_MATRIX_PERMIT to tune small matrics case --- common_c.h | 2 ++ common_d.h | 1 + common_level3.h | 8 +++++ common_macro.h | 10 ++++++ common_s.h | 2 ++ common_z.h | 2 ++ interface/gemm.c | 9 +++--- kernel/Makefile.L3 | 31 ++++++++++++++++++ kernel/generic/gemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ kernel/generic/zgemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ 10 files changed, 135 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_permit.c create mode 100644 kernel/generic/zgemm_small_matrix_permit.c diff --git a/common_c.h b/common_c.h index 9388ece93..dc273eef0 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,8 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT 
cgemm_small_matrix_permit + #define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn #define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt #define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr diff --git a/common_d.h b/common_d.h index 42c14e828..bb85f1232 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,7 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit #define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn #define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt diff --git a/common_level3.h b/common_level3.h index a3a487dab..187402a9a 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,11 +516,15 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, 
double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); @@ -536,6 +540,8 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); @@ -556,6 +562,8 @@ int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLON int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + int 
zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index 2cccf9b39..aeb9a205b 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,8 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN @@ -940,6 +942,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -1256,6 +1260,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -2093,6 +2099,8 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR @@ -2556,6 +2564,8 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR 
CGEMM_SMALL_KERNEL_NR diff --git a/common_s.h b/common_s.h index 685d73062..5851014cf 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,8 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit + #define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn #define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn diff --git a/common_z.h b/common_z.h index 8594ec74d..6088260a1 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,8 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit + #define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn #define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt #define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr diff --git a/interface/gemm.c b/interface/gemm.c index 7251993ee..ad8780668 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -464,25 +464,26 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT - //need to tune small matrices cases. 
- if(MNK <= 100.0*100.0*100.0){ - #if !defined(COMPLEX) + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } + return; + } #else + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } -#endif return; } #endif +#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c4a00158..f977793a0 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -451,18 +451,21 @@ endif ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) 
dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -481,6 +484,7 @@ CBLASOBJS += \ cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -4294,6 +4298,10 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + ifndef DGEMM_SMALL_K_NN DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif @@ -4310,6 +4318,9 @@ ifndef DGEMM_SMALL_K_TT DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4350,6 +4361,9 @@ $(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif ifndef SGEMM_SMALL_K_NN SGEMM_SMALL_K_NN = 
../generic/gemm_small_matrix_kernel_nn.c @@ -4367,6 +4381,9 @@ ifndef SGEMM_SMALL_K_TT SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -4407,6 +4424,9 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif ifndef CGEMM_SMALL_K_NN CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c @@ -4424,6 +4444,9 @@ ifndef CGEMM_SMALL_K_TT CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + $(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -4536,6 +4559,10 @@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + ifndef ZGEMM_SMALL_K_NN ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif @@ -4552,6 +4579,10 @@ ifndef ZGEMM_SMALL_K_TT ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + $(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c new file mode 100644 index 000000000..6e1ab1fc1 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c new file mode 100644 index 000000000..288937256 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} From 02c6e764f2e94779ae5699ca2ea8c2189aa9fa02 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:26:49 +0000 Subject: [PATCH 041/143] Small Matrix: skylakex: add SGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../sgemm_small_kernel_permit_skylakex.c | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5e0d9e5b4..264e3a9f4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,7 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c 
b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..159ae10b5 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + // tuning for A transpose + if (transa) { + if (transb) { + return 0; // TT kernel not support yet + } else { // TN kernel + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + } + + return 1; +} From 72e070539cd13364c8a02ac34e3dfcd65b657c7a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 31 May 2021 14:53:03 +0000 Subject: [PATCH 042/143] Small Matrix: skylakex: add sgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tt_skylakex.c | 3 + .../sgemm_small_kernel_permit_skylakex.c | 7 +- .../x86_64/sgemm_small_kernel_tt_skylakex.c | 414 ++++++++++++++++++ 4 files changed, 424 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 264e3a9f4..0f58a4d46 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -17,6 +17,8 @@ SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..27d9e0afd --- /dev/null +++ 
b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,3 @@ +#define B0 1 +#define TT 1 +#include "./sgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c index 159ae10b5..cbf2374bd 100644 --- a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -35,8 +35,11 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph // tuning for A transpose if (transa) { if (transb) { - return 0; // TT kernel not support yet - } else { // TN kernel + /* TT kernel perform not good when: + * 1. K is too small. + */ + if (K < 4) return 0; + } else { /* TN kernel perform not good when: * 1. C matrix is too big * 2. K is too small diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8da560ef7 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,414 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + 
N*16 + x + y*8)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*4)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#define REORDER_8x16(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + t4 = _mm512_unpacklo_ps(r4, r5); \ + t5 = _mm512_unpackhi_ps(r4, r5); \ + t6 = _mm512_unpacklo_ps(r6, r7); \ + t7 = _mm512_unpackhi_ps(r6, r7); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + v = _mm512_shuffle_ps(t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_ps(kc, t4, v); \ + r5 = _mm512_mask_blend_ps(k3, t6, v); \ + v = _mm512_shuffle_ps(t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_ps(kc, t5, v); \ + r7 = _mm512_mask_blend_ps(k3, t7, v); \ + t0 = _mm512_permutex2var_ps(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_ps(r1, idx_lo, 
r5); \ + t2 = _mm512_permutex2var_ps(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_ps(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_ps(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_ps(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_ps(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_ps(r3, idx_hi, r7); \ + t0 = _mm512_mul_ps(t0, alpha_512); \ + t1 = _mm512_mul_ps(t1, alpha_512); \ + t2 = _mm512_mul_ps(t2, alpha_512); \ + t3 = _mm512_mul_ps(t3, alpha_512); \ + t4 = _mm512_mul_ps(t4, alpha_512); \ + t5 = _mm512_mul_ps(t5, alpha_512); \ + t6 = _mm512_mul_ps(t6, alpha_512); \ + t7 = _mm512_mul_ps(t7, alpha_512); + +#define SAVE_8(N, x, y) {\ + __m256 v8 = _mm512_extractf32x8_ps(t##x, y); \ + STORE_8xy(v8, N, x, y); \ +} + +#define REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0, 0); SAVE_8(N, 1, 0); SAVE_8(N, 2, 0); SAVE_8(N, 3, 0); SAVE_8(N, 4, 0); SAVE_8(N, 5, 0); SAVE_8(N, 6, 0); SAVE_8(N, 7, 0); \ + SAVE_8(N, 0, 1); SAVE_8(N, 1, 1); SAVE_8(N, 2, 1); SAVE_8(N, 3, 1); SAVE_8(N, 4, 1); SAVE_8(N, 5, 1); SAVE_8(N, 6, 1); SAVE_8(N, 7, 1); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 16: SAVE_8(0, 7, 1); \ + case 15: SAVE_8(0, 6, 1); \ + case 14: SAVE_8(0, 5, 1); \ + case 13: SAVE_8(0, 4, 1); \ + case 12: SAVE_8(0, 3, 1); \ + case 11: SAVE_8(0, 2, 1); \ + case 10: SAVE_8(0, 1, 1); \ + case 9: SAVE_8(0, 0, 1); \ + case 8: SAVE_8(0, 7, 0); \ + case 7: SAVE_8(0, 6, 0); \ + case 6: SAVE_8(0, 5, 0); \ + case 5: SAVE_8(0, 4, 0); \ + case 4: SAVE_8(0, 3, 0); \ + case 3: SAVE_8(0, 2, 0); \ + case 2: SAVE_8(0, 1, 0); \ + case 1: SAVE_8(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x16(r0, r1, r2, r3) \ + __m512 t0, t1, t2, t3, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + 
t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + t0 = _mm512_mul_ps(r0, alpha_512); \ + t1 = _mm512_mul_ps(r1, alpha_512); \ + t2 = _mm512_mul_ps(r2, alpha_512); \ + t3 = _mm512_mul_ps(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m128 v4 = _mm512_extractf32x4_ps(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ + SAVE_4(N, 0, 2); SAVE_4(N, 1, 2); SAVE_4(N, 2, 2); SAVE_4(N, 3, 2); \ + SAVE_4(N, 0, 3); SAVE_4(N, 1, 3); SAVE_4(N, 2, 3); SAVE_4(N, 3, 3); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 16: SAVE_4(0, 3, 3); \ + case 15: SAVE_4(0, 2, 3); \ + case 14: SAVE_4(0, 1, 3); \ + case 13: SAVE_4(0, 0, 3); \ + case 12: SAVE_4(0, 3, 2); \ + case 11: SAVE_4(0, 2, 2); \ + case 10: SAVE_4(0, 1, 2); \ + case 9: SAVE_4(0, 0, 2); \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG 
m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo = _mm512_loadu_epi32(permute_table); + __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x16(0); + REORDER_STORE_8x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + 
nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x16(0); + } + } + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + REORDER_STORE_4x16(2); + REORDER_STORE_4x16(3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x16(0); + } + } + if (i < M) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 
3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; 
+} From 91ec21202bd8ae81f15dae79e004b2f00d20e559 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 11:31:50 +0000 Subject: [PATCH 043/143] Small Matrix: skylakex: add dgemm nn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nn_skylakex.c | 590 ++++++++++++++++++ 3 files changed, 594 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 0f58a4d46..a3c6f0556 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,8 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..a58738a25 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..8ffb899c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,590 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_pd1(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = 
_mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ + s0 = _mm256_fmadd_pd(s1, beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + 
BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); 
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); 
BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 4 || K < 16) { + register __mmask8 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); 
BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 4] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 8 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. 
+ */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask = (1UL << mm) - 1; + BLASLONG k8 = K & ~7; + BLASLONG k4 = K & ~3; + for (k = 0; k < k4; k += 4) { + __m256d r0, r1, r2, r3; + __m256d t0, t1, t2, t3; + r0 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(3 + k)]); + + t0 = _mm256_unpacklo_pd(r0, r1); + t1 = _mm256_unpackhi_pd(r0, r1); + t2 = _mm256_unpacklo_pd(r2, r3); + t3 = _mm256_unpackhi_pd(r2, r3); + + r0 = _mm256_permute2f128_pd(t0, t2, 0x20); + r1 = _mm256_permute2f128_pd(t1, t3, 0x20); + r2 = _mm256_permute2f128_pd(t0, t2, 0x31); + r3 = _mm256_permute2f128_pd(t1, t3, 0x31); + + switch (mm) { + case 4: _mm256_storeu_pd(&mbuf[k + 3*K], r3); + case 3: _mm256_storeu_pd(&mbuf[k + 2*K], r2); + case 2: _mm256_storeu_pd(&mbuf[k + 1*K], r1); + case 1: _mm256_storeu_pd(&mbuf[k + 0*K], r0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0); + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi32(permute_table); + __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); 
DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); 
MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); 
LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); 
MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} From f57fc932ac39c394e8f89bf7b6df3f1bddd315fd Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 14:23:56 +0000 Subject: [PATCH 044/143] Small Matrix: skylakex: add dgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nt_skylakex.c | 535 ++++++++++++++++++ 3 files changed, 539 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index a3c6f0556..db1e6cbff 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -29,6 +29,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..eafe2ce49 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c 
b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..0a95a68e2 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" +#include <stdio.h> +#include <memory.h> + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_sd(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + 
(M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + 
DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); 
MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) 
{ + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + 
MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 6) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + 
} + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); 
+ LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 
0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + 
BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 323d7da4f7c21b0a285af1527a47799c4adf69f4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 11:45:44 +0000 Subject: [PATCH 045/143] Small Matrix: skylakex: add dgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tt_skylakex.c | 392 ++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index db1e6cbff..3e84e794e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c 
DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..93fab1836 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8ff79d2c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,392 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*8)*ldc + i]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*4)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + 
N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#define REORDER_8x8(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512d t0, t1, t2, t3, t4, t5, t6, t7; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + t4 = _mm512_unpacklo_pd(r4, r5); \ + t5 = _mm512_unpackhi_pd(r4, r5); \ + t6 = _mm512_unpacklo_pd(r6, r7); \ + t7 = _mm512_unpackhi_pd(r6, r7); \ + r0 = _mm512_shuffle_f64x2(t0, t2, 0x88); \ + r1 = _mm512_shuffle_f64x2(t1, t3, 0x88); \ + r2 = _mm512_shuffle_f64x2(t0, t2, 0xdd); \ + r3 = _mm512_shuffle_f64x2(t1, t3, 0xdd); \ + r4 = _mm512_shuffle_f64x2(t4, t6, 0x88); \ + r5 = _mm512_shuffle_f64x2(t5, t7, 0x88); \ + r6 = _mm512_shuffle_f64x2(t4, t6, 0xdd); \ + r7 = _mm512_shuffle_f64x2(t5, t7, 0xdd); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_pd(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_pd(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_pd(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_pd(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_pd(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_pd(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_pd(r3, idx_hi, r7); \ + t0 = _mm512_mul_pd(t0, alpha_512); \ + t1 = _mm512_mul_pd(t1, alpha_512); \ + t2 = _mm512_mul_pd(t2, alpha_512); \ + 
t3 = _mm512_mul_pd(t3, alpha_512); \ + t4 = _mm512_mul_pd(t4, alpha_512); \ + t5 = _mm512_mul_pd(t5, alpha_512); \ + t6 = _mm512_mul_pd(t6, alpha_512); \ + t7 = _mm512_mul_pd(t7, alpha_512); + +#define SAVE_8(N, x) {\ + STORE_8xy(t##x, N, x, 0); \ +} + +#define REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0); SAVE_8(N, 1); SAVE_8(N, 2); SAVE_8(N, 3); SAVE_8(N, 4); SAVE_8(N, 5); SAVE_8(N, 6); SAVE_8(N, 7); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 8: SAVE_8(0, 7); \ + case 7: SAVE_8(0, 6); \ + case 6: SAVE_8(0, 5); \ + case 5: SAVE_8(0, 4); \ + case 4: SAVE_8(0, 3); \ + case 3: SAVE_8(0, 2); \ + case 2: SAVE_8(0, 1); \ + case 1: SAVE_8(0, 0); \ + } + +#define MASK_REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) \ + __m512d t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + r0 = _mm512_permutex2var_pd(t0, idx_lo, t2); \ + r1 = _mm512_permutex2var_pd(t1, idx_lo, t3); \ + r2 = _mm512_permutex2var_pd(t0, idx_hi, t2); \ + r3 = _mm512_permutex2var_pd(t1, idx_hi, t3); \ + t0 = _mm512_mul_pd(r0, alpha_512); \ + t1 = _mm512_mul_pd(r1, alpha_512); \ + t2 = _mm512_mul_pd(r2, alpha_512); \ + t3 = _mm512_mul_pd(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m256d v4 = _mm512_extractf64x4_pd(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 
1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); + __m256d beta_256 = _mm256_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + long long permute_table[] = { + 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, + 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); 
+ MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x8(0); + REORDER_STORE_8x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x8(0); + } + } + for (; i < m4; i += 4) { + long long permute_table2[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + idx_lo = _mm512_loadu_epi64(permute_table2); + idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + 
LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + REORDER_STORE_4x8(2); + REORDER_STORE_4x8(3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x8(0); + } + } + if (i < M) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + 
SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 3e79f6d89abe60b75a4a504670a676472b2d0918 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:56:40 +0000 Subject: [PATCH 046/143] Small Matrix: skylakex: add dgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tn_skylakex.c | 322 ++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3e84e794e..c1d8f8e89 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..1dfa0aaf1 --- /dev/null +++ 
b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..0881f35b2 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) +#define STORE_M4(N, s0) _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); +#define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), 
"v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); + +#define STORE_N4(M, s0) \ + s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k8 = K & ~7; + + __mmask8 mask; + + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0); + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + 
MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, 
x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + 
+ MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 8592c21af4d6328068b87f402a6801b30e2aebec Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:57:39 +0000 Subject: [PATCH 047/143] Small Matrix: skylakex: dgemm nn: fix typo in idx load --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index 8ffb899c8..ff2a04beb 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8); + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); From fa777f5517d4b43acfda8b8a58649af94c1e40b4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 14:55:54 +0000 Subject: [PATCH 048/143] Small Matrix: skylakex: add DGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../dgemm_small_kernel_permit_skylakex.c | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c1d8f8e89..eb0cbaf98 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,7 @@ DGEMMITCOPY = 
dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..9cca08e71 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + if (transa && !transb) { + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + return 1; +} From 210a1584c5299d8e53129b4e2a8b73b67046cc77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:19:16 +0200 Subject: [PATCH 049/143] Rebase source and edit TLS version of the message as well --- driver/others/memory.c | 46 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..500ec22c5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) 
|| defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,10 +1619,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE 
h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) { return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -1691,7 +1702,6 @@ void gotoblas_dummy_for_PGI(void) { #include #include #include -#include #include #include #include @@ -1969,7 +1979,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1977,7 +1987,7 @@ int blas_get_cpu_number(void){ if 
(blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2001,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2868,8 +2878,12 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); - + printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 898212efcda215ccab3b46b4a645c8eda2ca7948 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:50:14 +0200 Subject: [PATCH 050/143] Actually add the message to the TLS section --- driver/others/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 500ec22c5..460a3d557 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock); return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 6b58bca18b427a0c149d25542a5eb7c5ada6a19f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 15 Jun 2021 16:09:51 +0000 Subject: [PATCH 051/143] Small Matrix: disable low performance default kernel --- kernel/generic/gemm_small_matrix_permit.c | 3 +++ kernel/generic/zgemm_small_matrix_permit.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c index 6e1ab1fc1..1ae6d2520 100644 --- a/kernel/generic/gemm_small_matrix_permit.c +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c index 288937256..940ff5dc8 100644 --- a/kernel/generic/zgemm_small_matrix_permit.c +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } From 93c8bafff56052534554e3a47e56552c97217228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Aug 2021 10:45:45 +0200 Subject: [PATCH 052/143] Update Travis badge in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7e0d60a7..88a5a5035 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) From 478d1086c11f28903395bd13050dbca62aec81ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 03:12:41 +0000 Subject: [PATCH 053/143] Small Matrix: support DYNAMIC_ARCH build --- common_c.h | 83 +++++++++++++++-------------- common_d.h | 23 ++++---- common_param.h | 119 ++++++++++++++++++++++++++++++++++++++++++ common_s.h | 23 ++++---- common_z.h | 83 +++++++++++++++-------------- interface/gemm.c | 50 ++++++++++-------- kernel/setparam-ref.c | 37 +++++++++++++ 7 files changed, 295 insertions(+), 123 deletions(-) diff --git a/common_c.h b/common_c.h index dc273eef0..6cff610bb 100644 --- a/common_c.h +++ b/common_c.h @@ -234,46 +234,6 @@ #define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit -#define 
CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn -#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt -#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr -#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc - -#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn -#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt -#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr -#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc - -#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn -#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt -#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr -#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc - -#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn -#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct -#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr -#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc - -#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn -#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt -#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr -#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc - -#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn -#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt -#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr -#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc - -#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn -#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt -#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr -#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc - -#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn -#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct -#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr -#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc - #else #define CAMAX_K gotoblas -> camax_k @@ -468,8 +428,51 @@ #define CGEADD_K gotoblas -> cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit + #endif +#define 
CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) +#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) +#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) +#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) + +#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) +#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) +#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) +#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) + +#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) +#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) +#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) +#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) + +#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) +#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) +#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) +#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) + +#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) +#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) +#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) +#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) + +#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) +#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) +#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) +#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) + +#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) +#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) +#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) +#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) + +#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) 
+#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) +#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) +#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) + + #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn diff --git a/common_d.h b/common_d.h index bb85f1232..6f4bb2ded 100644 --- a/common_d.h +++ b/common_d.h @@ -159,16 +159,6 @@ #define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit -#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn -#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt -#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn -#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt - -#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn -#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt -#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn -#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt - #else #define DAMAX_K gotoblas -> damax_k @@ -293,8 +283,21 @@ #define DGEADD_K gotoblas -> dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit + #endif +#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) +#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) +#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) +#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) + +#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) +#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) +#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) +#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) + + #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn diff --git a/common_param.h b/common_param.h index 3e3ae06f8..7e8bea4fe 100644 --- a/common_param.h +++ b/common_param.h @@ -207,6 +207,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, 
BLASLONG, float *); #endif #ifdef BUILD_SINGLE +#ifdef SMALL_MATRIX_OPT + int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -314,6 +328,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif 
#ifdef BUILD_DOUBLE +#ifdef SMALL_MATRIX_OPT + int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + + int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + + int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -513,6 +540,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, 
float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); +#ifdef SMALL_MATRIX_OPT + int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + + int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float 
beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nc 
)(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float 
alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -679,6 +750,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); +#ifdef SMALL_MATRIX_OPT + int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + + int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double 
beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double 
alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * 
A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -1069,6 +1184,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); extern gotoblas_t *gotoblas; +#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) + #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB @@ -1174,6 +1291,8 @@ extern gotoblas_t *gotoblas; #else +#define FUNC_OFFSET(func) (size_t)(func) + #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define 
GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A diff --git a/common_s.h b/common_s.h index 5851014cf..fdd80b62f 100644 --- a/common_s.h +++ b/common_s.h @@ -166,16 +166,6 @@ #define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit -#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn -#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt -#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn -#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt - -#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn -#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt -#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn -#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt - #else #define SAMAX_K gotoblas -> samax_k @@ -311,8 +301,21 @@ #define SGEADD_K gotoblas -> sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit + #endif +#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) +#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) +#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) +#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) + +#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) +#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) +#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) +#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) + + #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn diff --git a/common_z.h b/common_z.h index 6088260a1..c12d71b39 100644 --- a/common_z.h +++ b/common_z.h @@ -234,46 +234,6 @@ #define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit -#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn -#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt -#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr -#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc - -#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn -#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt 
-#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr -#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc - -#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn -#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt -#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr -#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc - -#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn -#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct -#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr -#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc - -#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn -#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt -#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr -#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc - -#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn -#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt -#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr -#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc - -#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn -#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt -#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr -#define ZGEMM_SMALL_KERNEL_B0_RC zgemm_small_kernel_b0_rc - -#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn -#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct -#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr -#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc - #else #define ZAMAX_K gotoblas -> zamax_k @@ -468,8 +428,51 @@ #define ZGEADD_K gotoblas -> zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit + #endif +#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) +#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) +#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) +#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) + +#define ZGEMM_SMALL_KERNEL_TN 
FUNC_OFFSET(zgemm_small_kernel_tn) +#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) +#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) +#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) + +#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) +#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) +#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) +#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) + +#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) +#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) +#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) +#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) + +#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) +#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) +#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) +#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) + +#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) +#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) +#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) +#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) + +#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) +#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) +#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) +#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) + +#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) +#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) +#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) +#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) + + #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN 
zgemm_tn diff --git a/interface/gemm.c b/interface/gemm.c index ad8780668..f4b9f1537 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,25 +106,34 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + #ifndef COMPLEX -static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +static size_t gemm_small_kernel[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, - GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, #endif }; -static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { + +static size_t gemm_small_kernel_b0[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, #endif }; +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 
GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, @@ -133,7 +142,7 @@ static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLO #endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel_b0[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, @@ -141,6 +150,9 @@ static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLA GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif }; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif @@ -163,7 +175,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) +#ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -287,11 +299,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) - double MNK; -#endif - #ifdef SMP + double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -459,32 +468,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#if defined(SMP) || 
defined(SMALL_MATRIX_OPT) - MNK = (double) args.m * (double) args.n * (double) args.k; -#endif - #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } return; } #else if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); + (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); + (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } return; } #endif #endif - buffer = (XFLOAT *)blas_memory_alloc(0); @@ -497,7 +501,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - + MNK = (double) args.m * (double) 
args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 1e846a61c..f303d0dc6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -171,6 +171,14 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif +#if BUILD_SINGLE == 1 +#ifdef SMALL_MATRIX_OPT + sgemm_small_matrix_permitTS, + sgemm_small_kernel_nnTS, sgemm_small_kernel_ntTS, sgemm_small_kernel_tnTS, sgemm_small_kernel_ttTS, + sgemm_small_kernel_b0_nnTS, sgemm_small_kernel_b0_ntTS, sgemm_small_kernel_b0_tnTS, sgemm_small_kernel_b0_ttTS, +#endif +#endif + #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -257,6 +265,11 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) +#ifdef SMALL_MATRIX_OPT + dgemm_small_matrix_permitTS, + dgemm_small_kernel_nnTS, dgemm_small_kernel_ntTS, dgemm_small_kernel_tnTS, dgemm_small_kernel_ttTS, + dgemm_small_kernel_b0_nnTS, dgemm_small_kernel_b0_ntTS, dgemm_small_kernel_b0_tnTS, dgemm_small_kernel_b0_ttTS, +#endif dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -389,6 +402,18 @@ gotoblas_t TABLE_NAME = { #endif cgemm_oncopyTS, cgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + cgemm_small_matrix_permitTS, + cgemm_small_kernel_nnTS, cgemm_small_kernel_ntTS, cgemm_small_kernel_nrTS, cgemm_small_kernel_ncTS, + cgemm_small_kernel_tnTS, cgemm_small_kernel_ttTS, cgemm_small_kernel_trTS, cgemm_small_kernel_tcTS, + cgemm_small_kernel_rnTS, cgemm_small_kernel_rtTS, cgemm_small_kernel_rrTS, cgemm_small_kernel_rcTS, + cgemm_small_kernel_cnTS, cgemm_small_kernel_ctTS, cgemm_small_kernel_crTS, cgemm_small_kernel_ccTS, + cgemm_small_kernel_b0_nnTS, 
cgemm_small_kernel_b0_ntTS, cgemm_small_kernel_b0_nrTS, cgemm_small_kernel_b0_ncTS, + cgemm_small_kernel_b0_tnTS, cgemm_small_kernel_b0_ttTS, cgemm_small_kernel_b0_trTS, cgemm_small_kernel_b0_tcTS, + cgemm_small_kernel_b0_rnTS, cgemm_small_kernel_b0_rtTS, cgemm_small_kernel_b0_rrTS, cgemm_small_kernel_b0_rcTS, + cgemm_small_kernel_b0_cnTS, cgemm_small_kernel_b0_ctTS, cgemm_small_kernel_b0_crTS, cgemm_small_kernel_b0_ccTS, +#endif + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, @@ -533,6 +558,18 @@ gotoblas_t TABLE_NAME = { #endif zgemm_oncopyTS, zgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + zgemm_small_matrix_permitTS, + zgemm_small_kernel_nnTS, zgemm_small_kernel_ntTS, zgemm_small_kernel_nrTS, zgemm_small_kernel_ncTS, + zgemm_small_kernel_tnTS, zgemm_small_kernel_ttTS, zgemm_small_kernel_trTS, zgemm_small_kernel_tcTS, + zgemm_small_kernel_rnTS, zgemm_small_kernel_rtTS, zgemm_small_kernel_rrTS, zgemm_small_kernel_rcTS, + zgemm_small_kernel_cnTS, zgemm_small_kernel_ctTS, zgemm_small_kernel_crTS, zgemm_small_kernel_ccTS, + zgemm_small_kernel_b0_nnTS, zgemm_small_kernel_b0_ntTS, zgemm_small_kernel_b0_nrTS, zgemm_small_kernel_b0_ncTS, + zgemm_small_kernel_b0_tnTS, zgemm_small_kernel_b0_ttTS, zgemm_small_kernel_b0_trTS, zgemm_small_kernel_b0_tcTS, + zgemm_small_kernel_b0_rnTS, zgemm_small_kernel_b0_rtTS, zgemm_small_kernel_b0_rrTS, zgemm_small_kernel_b0_rcTS, + zgemm_small_kernel_b0_cnTS, zgemm_small_kernel_b0_ctTS, zgemm_small_kernel_b0_crTS, zgemm_small_kernel_b0_ccTS, +#endif + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, From fee5abd84bf01aba7a2223f7264fcc7da66d1b20 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 08:50:15 +0000 Subject: [PATCH 054/143] Small Matrix: support cmake build --- cmake/system.cmake | 4 ++ kernel/CMakeLists.txt | 110 
++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index f8bd6678e..e51dc1fdc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -258,6 +258,10 @@ if (NEED_PIC) endif() endif () +if (SMALL_MATRIX_OPT) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") +endif () + if (DYNAMIC_ARCH) if (X86 OR X86_64 OR ARM64 OR PPC) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index f0793bdef..769a73b91 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -458,7 +458,117 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) + else () + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + 
set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + endif () + endif () + if (SMALL_MATRIX_OPT) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" 
"gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + endif () + endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") From aa50185647ba6966dcdb731372af2ecd5ae3b1d4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 02:45:53 +0000 Subject: [PATCH 055/143] Small Matrix: better handle with GEMM3M macro --- interface/gemm.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index f4b9f1537..775f654c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,6 +105,7 @@ static int (*gemm[])(blas_arg_t *,
BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#ifndef GEMM3M #ifdef SMALL_MATRIX_OPT #ifndef DYNAMIC_ARCH #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) @@ -115,18 +116,14 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #ifndef COMPLEX static size_t gemm_small_kernel[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, -#endif }; static size_t gemm_small_kernel_b0[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, -#endif }; #define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) @@ -134,27 +131,24 @@ static size_t gemm_small_kernel_b0[] = { #else static size_t zgemm_small_kernel[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, -#endif }; static size_t zgemm_small_kernel_b0[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, -#endif }; #define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) #define 
ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif +#endif #ifndef CBLAS @@ -468,6 +462,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#ifndef GEMM3M #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ @@ -488,6 +483,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif +#endif #endif buffer = (XFLOAT *)blas_memory_alloc(0); From 76ea8db4da1a651bb4de744162de1ecfc6762e7c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 02:57:58 +0000 Subject: [PATCH 056/143] Small Matrix: enable by default for x86_64 arch If no customized GEMM_SMALL_M_PERMIT kernel defined, it will just by pass to normal path. --- Makefile.system | 3 +++ cmake/system.cmake | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20d8d2f2a..20db80d07 100644 --- a/Makefile.system +++ b/Makefile.system @@ -245,6 +245,9 @@ ONLY_CBLAS = 0 endif #For small matrix optimization +ifeq ($(ARCH), x86_64) +SMALL_MATRIX_OPT = 1 +endif ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif diff --git a/cmake/system.cmake b/cmake/system.cmake index e51dc1fdc..7d2672998 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -258,6 +258,9 @@ if (NEED_PIC) endif() endif () +if (X86_64) + set(SMALL_MATRIX_OPT TRUE) +endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") endif () From 5d86becdaec262e8a2869ce909d94bec881fbfb6 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 5 Aug 2021 11:11:14 +0800 Subject: [PATCH 057/143] Add all SBGEMM kernels for IA AVX512-BF16 based platforms Added all SBGEMM kernels including NN/NT/TN/TT for both 
ColMajor and RowMajor, based on AVX512-BF16 ISA set on IA. Signed-off-by: Chen, Guobing --- kernel/x86_64/bf16_common_macros.h | 52 + .../x86_64/sbgemm_block_microk_cooperlake.c | 2024 ++++++++++++++--- .../sbgemm_microk_cooperlake_template.c | 1737 +++++++++++--- 3 files changed, 3268 insertions(+), 545 deletions(-) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 1014ecc4d..78db7abb2 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -29,6 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define _MM512_BROADCASTD_EPI32(addr, zmm) \ + __asm__ ("vpbroadcastd (%1), %0;" \ + : "=v" (zmm) \ + : "r" (addr) ) + +#define PREFETCH_T0(addr) \ + __asm__ ("prefetcht0 (%0);" \ + : \ + : "r" (addr) ) + #define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ reg256##_1 = _mm512_castps512_ps256(reg512##_1); @@ -721,6 +731,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm_mask_storeu_ps(targetAddr, mask, regResult); +/* Store 16 (result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm512_add_ps(regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_add_ps(regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm256_add_ps(regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_add_ps(regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm_add_ps(regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm_add_ps(regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + /* Store 16 (alpha * result) to y */ #define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2376fed02..147c5ebdd 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1,4 +1,4 @@ -#include "sbgemm.h" +//#include "sbgemm.h" #include // Walk around those intrinsics that missed by compiler @@ -7,420 +7,1878 @@ #define MM256_STOREU_EPI16(addr, reg) \ 
_mm256_mask_storeu_epi16((addr), ~0, (reg)) -#include -void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) -{ - printf("---- BLOCK %ld x %ld ----\n", m, n); - for (BLASLONG i=0; i> (32-m)); __m512i array512_0, array512_1, array512_2, array512_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0, idx_target_base1; + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0, * dst_addr1; BLASLONG LDA_2x = 2*lda; BLASLONG BF16_BLOCK_T_M_2x = 2*32; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - idx_target_base1 = 32; - for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); - array512_1 = _mm512_loadu_si512(&A[idx_src_base1]); - array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); - array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); - - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += BF16_BLOCK_T_M_2x; - idx_target_base1 += BF16_BLOCK_T_M_2x; - } - - if (tag_k_2x != k) { - __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); - array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); - array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); - } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif -} - -void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) -{ - BLASLONG tag_k_2x = k & (~1); - unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-m)); - __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - - __m512i array512_0, array512_1, array512_2, array512_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG 
idx_target_base0, idx_target_base1; + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + dst_addr1 = block_A + 32; - BLASLONG LDA_2x = 2*lda; - BLASLONG BF16_BLOCK_T_M_2x = 2*32; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - idx_target_base1 = 32; for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1); array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += BF16_BLOCK_T_M_2x; - idx_target_base1 += BF16_BLOCK_T_M_2x; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += BF16_BLOCK_T_M_2x; + dst_addr1 += BF16_BLOCK_T_M_2x; } if (tag_k_2x != k) { __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif } +// INCOPY Kernel, 0> (16-m)); __m256i array256_0, array256_1, array256_2, array256_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + bfloat16 * 
src_addr0, * src_addr1; + bfloat16 * dst_addr0; BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); - array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, src_addr1); array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += 32; } if (tag_k_2x != k) { __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); } +} -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif +// K=32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_32x16(bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_4x = lda*4; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = 
block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0); + array512_way0_1 = _mm512_loadu_si512(src_addr1); + array512_way0_2 = _mm512_loadu_si512(src_addr2); + array512_way0_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0); + array512_way1_1 = _mm512_loadu_si512(src_addr1); + array512_way1_2 = _mm512_loadu_si512(src_addr2); + array512_way1_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = 
_mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0); + array512_way2_1 = _mm512_loadu_si512(src_addr1); + array512_way2_2 = _mm512_loadu_si512(src_addr2); + array512_way2_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0); + array512_way3_1 = _mm512_loadu_si512(src_addr1); + array512_way3_2 = _mm512_loadu_si512(src_addr2); + array512_way3_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = 
_mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 =
_mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols +
array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } -void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +// K=Any number but will be processed based on 32, M=32 +void COL_MAJOR_ITCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - BLASLONG tag_k_2x = k & (~1); - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; - __m256i array256_0, array256_1, array256_2, array256_3; + BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + BLASLONG LDA_16x = lda*16; - BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); - array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); - array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*16; - 
idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + 
array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = 
_mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = 
_mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 14/15 and 30/31 cols + 
array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_16x; + src_addr1 += LDA_16x; + src_addr2 += LDA_16x; + src_addr3 += LDA_16x; + dst_addr0 -= (64*7 - 32); + dst_addr1 -= (64*7 - 32); + } + src_addr0 -= (LDA_16x*2); + src_addr1 -= (LDA_16x*2); + src_addr2 -= (LDA_16x*2); + src_addr3 -= (LDA_16x*2); + dst_addr0 += (32*30); + dst_addr1 += (32*30); } - if (tag_k_2x != k) { - __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); - array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], 
array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = 
_mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, 
and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = 
array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = 
_mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + } + } +} + +// K=Any number but will be processed based on 32, 16> 1; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + // Load and preprocess 4 rows + array512[array_idx+0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[array_idx+1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[array_idx+2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[array_idx+3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += 
LDA_4x; + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); 
+ _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + + for (int j = 0; j < m_rem; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m_rem; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], 
_mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + } +} + +// K=Any number but will be processed based on 32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i 
M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = 
_mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = 
_mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + 
array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, 
array512_3); + dst_addr0 += 32*9; + dst_addr1 += 32*9; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += 
LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 
8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] 
= array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + 
_mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// K=Any number but will be processed based on 32, M<=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + src_addr0 = A; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512[16]; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for 
(int j = 0; j < m; j++) { + array512[j] = _mm512_loadu_si512(src_addr0+j*lda+idx_k); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + // Compose and store the 0/1, 2/3, 4/5, 6/7 and 16/17, 18/19, 20/21, 22/23 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + // Compose and store the 8/9, 10/11, 12/13, 14/15 and 24/25, 26/27, 28/29, 30/31 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + 
_mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + dst_addr0 += 32*8; + dst_addr1 += 32*8; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + + for (int j = 0; j < m; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = 
array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } } +// COL_MAJOR_ONCOPY_KERNEL_16x32 behaves exactly the same as COL_MAJOR_ITCOPY_KERNEL_Kx16 +#define COL_MAJOR_ONCOPY_KERNEL_16x32 COL_MAJOR_ITCOPY_KERNEL_Kx16 + void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; - BLASLONG idx_target_base0; - idx_src_base0 = 0; - idx_src_base1 = 1*ldb; - idx_src_base2 = 2*ldb; - idx_src_base3 = 3*ldb; - idx_src_base4 = 4*ldb; - idx_src_base5 = 5*ldb; - idx_src_base6 = 6*ldb; - idx_src_base7 = 7*ldb; - 
idx_target_base0 = 0; + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3, * src_addr4, * src_addr5, * src_addr6, * src_addr7; + bfloat16 * dst_addr0; + + unsigned char blend_mask = (((unsigned char)0xcc)); + __m512i permute_idx = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + src_addr4 = src_addr0 + 4*ldb; + src_addr5 = src_addr0 + 5*ldb; + src_addr6 = src_addr0 + 6*ldb; + src_addr7 = src_addr0 + 7*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); - idx_target_base0 += 32*8; + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = 
_mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_loadu_si512(src_addr4+idx_k); + array512_1 = _mm512_loadu_si512(src_addr5+idx_k); + array512_2 = _mm512_loadu_si512(src_addr6+idx_k); + array512_3 = _mm512_loadu_si512(src_addr7+idx_k); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + 
array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + + dst_addr0 += 256; } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); - 
_mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr4+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr5+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr6+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr7+tag_k_32x); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = 
_mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_4x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0; + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, 
array512_way0_2, array512_way0_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + dst_addr0 += 128; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, 
src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + } } void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); BLASLONG tag_n_2x = n & (~1); - BLASLONG idx_src_base0; - BLASLONG idx_target_base0; + + bfloat16 * src_addr0; + bfloat16 * dst_addr0; BLASLONG LDB_2x = 2*ldb; - idx_target_base0 = 0; + src_addr0 = B; + dst_addr0 = block_B; for (BLASLONG idx_k = 0; idx_k < 
tag_k_32x; idx_k += 32) { - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_loadu_si512(src_addr0 + ldb + idx_k)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - idx_target_base0 += 32; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + dst_addr0 += 32; } } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + ldb + tag_k_32x)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); } } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = 
k & (~1); + unsigned char tail_mask_value = (unsigned char) 0xff; + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask = (((unsigned char)0xff) >> (8-n)); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG 
idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } } -// Scale matrix C while beta is not ZERO or ONE +// Scale matrix C when beta is not ZERO or ONE void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 array_512_0, array_512_1, array_512_2, array_512_3; + __m512 BETAVECTOR = _mm512_set1_ps(beta); - __m512 BETAVECTOR = _mm512_set1_ps(beta); + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for 
(BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); - array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); - array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_1 = _mm512_loadu_ps(C_addr1 + idx_m); + array_512_2 = _mm512_loadu_ps(C_addr2 + idx_m); + array_512_3 = _mm512_loadu_ps(C_addr3 + idx_m); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + _mm512_storeu_ps(C_addr1 + idx_m, array_512_1); + _mm512_storeu_ps(C_addr2 + idx_m, array_512_2); + _mm512_storeu_ps(C_addr3 + idx_m, array_512_3); + } - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); - _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); - _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); - _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); - } + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, C_addr1 + tag_n_Mx); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, C_addr2 + tag_n_Mx); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, C_addr3 + tag_n_Mx); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = 
_mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, array_512_1); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, array_512_2); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, array_512_3); + } - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); - array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); - array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); - _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; - } - - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_storeu_ps(&C[idx_base_0+idx_m], 
array_512_0); - } - - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - } - idx_base_0 += ldc; + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); } + C_addr0 += ldc; } - } else { - } } -// Scale matrix C while beta is not ZERO or ONE +// Zero C matrix when Beta is 0 void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); - } + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); - 
_mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); - } + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr1 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr2 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr3 + idx_m, ZEROVECTOR); + } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, ZEROVECTOR); } - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - } + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - } - idx_base_0 += ldc; + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); } - } - } else { + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + C_addr0 += ldc; + } } -} \ No newline at end of file +} diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index dd4cb440b..c71595813 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -2,45 +2,115 @@ #include "bf16_common_macros.h" 
#include +/* These macros are needed and should be placed at the right place +#define BF16_BLOCK_STEP_N 8 +#define BF16_BLOCK_THRES_K 1024 +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define A(i,j) A[(i)*lda+(j)] +#define B(i,j) B[(i)*ldb+(j)] +#define C(i,j) C[(i)*ldc+(j)] + +#define ONE 1.e0f +#define ZERO 0.e0f +*/ + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT -#undef SBGEMM_BLOCK_KERNEL_32x8x32 -#undef SBGEMM_BLOCK_KERNEL_16x8x32 -#undef SBGEMM_BLOCK_KERNEL_32xNx32 -#undef SBGEMM_BLOCK_KERNEL_16xNx32 -#undef SBGEMM_BLOCKING_KERNEL_2 +#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_NN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_NT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_NT_16xNxK +#undef SBGEMM_BLOCK_KERNEL_TN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_TN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_TT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_TT_16xNxK +#undef SBGEMM_BLOCKING_KERNEL_NN +#undef SBGEMM_BLOCKING_KERNEL_NT +#undef SBGEMM_BLOCKING_KERNEL_TN +#undef SBGEMM_BLOCKING_KERNEL_TT #ifndef ONE_ALPHA // ALPHA is not ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha - #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT 
STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_alpha + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_alpha + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_alpha + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_alpha + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_alpha + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_alpha #else // ALPHA is ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one - #define SBGEMM_BLOCKING_KERNEL_2 
sbgemm_blocking_kernel_2_one + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_one + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_one + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_one + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_one + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_one + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_one #endif +extern bfloat16 * block_A; +extern bfloat16 * block_B; +/* --------------------------------------------- NN kernels ------------------------------------------ */ // SBGEMM Kernel for 16> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); result_512_0 = _mm512_shuffle_f32x4(result_512_0, 
result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) } else { result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) 
- STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) - STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) - STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) - STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) - STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) - STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) } } // SBGEMM Kernel for 16> (32-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (32-m)); for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); - STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); 
- STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) } } } // SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base #ifndef ONE_ALPHA // ALPHA is not ONE -void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #else // ALPHA is ONE -void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #endif { + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); - BLASLONG idxB_base = 0; - BLASLONG width = 32; #ifndef ONE_ALPHA __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); @@ -432,21 +484,49 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a result_512[i+1] = _mm512_setzero_ps(); } - for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { // Load B with unroll n - for (int i = 0; i < n; i ++) { - arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); - idxB_base += 32; + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) 
_mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } } + } - if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + BLASLONG width = k - tag_k_32x; for (BLASLONG idx = 0; idx < width;) { // Each two rows are a group for 32-pair bf16 elements // Load two rows into a 512 register - arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; - for (int i = 0; i < n; i ++) { + for (int i = 0; i < n; i++) { result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); } @@ -462,23 +542,24 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a } if (m != 16) { - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) } } } + + #ifndef ONE_ALPHA // ALPHA is not ONE -void 
sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #else // ALPHA is ONE -void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #endif { BLASLONG m_step, n_step, k_step, k_step_round32; @@ -499,63 +580,52 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); } for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); } } if (tag_m_Nx != M) { m_step = M - tag_m_Nx; if (m_step > 16) { - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); - for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + 
(idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); - } - - if (tag_n_Nx != n_to) { - n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); - } - } else if (m_step == 16) { - COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); } } else { - COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), 
ldc); } } } @@ -573,22 +643,274 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } } else { - m_step = M - tag_m_Nx; + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NN kernels --------------------------------------- */ + +/* --------------------------------------------- NT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef 
ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, 
bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
- COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + 
n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } } idx_k += k_step; @@ -597,13 +919,884 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, k_step_round32 = k_step & (~31); k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; } + n_from = n_to; n_to += BF16_BLOCK_THRES_N; n_to = (n_to > N) ? N : n_to; tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? 
BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } } } +/* ----------------------------------------- End of NT kernels --------------------------------------- */ + +/* --------------------------------------------- TN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_8, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_MASK_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_8, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + 
STORE16_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M=16, N=8, K=Any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Load 16 pair of BF16 elements from A (16 rows) + arrayA_512_0 = _mm512_loadu_si512(A_addr + 0); + + // Load 8 rows of B + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); 
+ _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 32 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else 
{ + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K=Any number but will be processed based on 32 +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + + int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i++) { + result_512[i] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for 
(int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, 
blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); // TODO how to process m + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + 
(idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TN kernels --------------------------------------- */ + +/* --------------------------------------------- TT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = 
_mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), 
ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TT kernels --------------------------------------- */ #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, @@ -613,13 +1806,33 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) #endif { - bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; - bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; - - // TODO: assume no trans for both A and B, to complement these scenarios later if (Order == CblasColMajor) { - SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_NT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_TN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } } else { - + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NT(N, M, K, alpha, B, ldb, A, 
lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } } -} \ No newline at end of file +} From 44d0032f3b8e9794d51b7807b3fb53905a2e9f1c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 04:43:47 +0000 Subject: [PATCH 058/143] Small Matrix: skylakex: fix build error in old compiler --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_tt_skylakex.c | 10 +++++----- kernel/x86_64/sgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_tt_skylakex.c | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index ff2a04beb..d9b380fff 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index 0a95a68e2..e757197ba 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { 
index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 0881f35b2..18c797283 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m4; i += 4) { for (j = 0; j < n4; j += 4) { diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c index 8ff79d2c8..00f42aa76 100644 --- a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m8; i += 8) { for (j = 0; j < n16; j += 16) { @@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - idx_lo = _mm512_loadu_epi64(permute_table2); - idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + idx_lo = 
_mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); #endif diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index f293bf9f9..a7d87f8c4 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n64; j += 64) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c index 8da560ef7..023f58746 100644 --- a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); 
__mmask16 kc = 0xcccc; __mmask16 k3 = 0x3333; __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE @@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); #endif From c17d6dacb23f0862f6f0318c55c097c361132663 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 05:46:13 +0000 Subject: [PATCH 059/143] Small Matrix: skip compile in unimplemented data type --- interface/gemm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 775f654c3..3497d8651 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,8 +105,13 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT #ifndef DYNAMIC_ARCH #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) #else @@ -148,7 +153,6 @@ static size_t zgemm_small_kernel_b0[] = { #define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif -#endif #ifndef CBLAS @@ -462,8 +466,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if USE_SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ @@ -483,7 +486,6 @@ 
void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif -#endif #endif buffer = (XFLOAT *)blas_memory_alloc(0); From e5ba7c3235cd5ac9613e0989621c8d22294def5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Aug 2021 11:08:18 +0200 Subject: [PATCH 060/143] Disable all x86 jobs --- .travis.yml | 302 ++++++++++++++++++++++++++-------------------------- 1 file changed, 151 insertions(+), 151 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2a221e3bd..8657b64f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,38 +55,38 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 USE_OPENMP=1" +# +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 CC=clang" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" +# +# - <<: *test-ubuntu +# addons: +# apt: +# packages: +# - gcc-multilib +# - gfortran-multilib +# env: +# - TARGET_BOX=LINUX32 +# - BTYPE="BINARY=32" +# - os: linux arch: ppc64le dist: bionic @@ -121,47 +121,47 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - 
gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - +# - os: linux +# compiler: gcc +# addons: +# apt: +# packages: +# - binutils-mingw-w64-x86-64 +# - gcc-mingw-w64-x86-64 +# - gfortran-mingw-w64-x86-64 +# before_script: *common-before +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=WIN64 +# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" +# # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. # These jobs needs sudo, so Travis runs them on VM-based infrastructure # which is slower than container-based infrastructure used for jobs # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. 
- - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" + # - &test-alpine + # os: linux + # dist: trusty + # sudo: true + # language: minimal + # before_install: + # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + # install: + # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + # before_script: *common-before + # script: + # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + # - alpine make -C test $COMMON_FLAGS $BTYPE + # - alpine make -C ctest $COMMON_FLAGS $BTYPE + # - alpine make -C utest $COMMON_FLAGS $BTYPE + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64" # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, # but only on Travis CI, cannot reproduce it elsewhere. @@ -171,98 +171,98 @@ matrix: # - TARGET_BOX=LINUX64_MUSL # - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# # Build with the same flags as Alpine do in OpenBLAS package. +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - # Build with the same flags as Alpine do in OpenBLAS package. 
- - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" +# - &test-cmake +# os: linux +# compiler: clang +# addons: +# apt: +# packages: +# - gfortran +# - cmake +# dist: trusty +# sudo: true +# before_script: +# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" +# script: +# - mkdir build +# - CONFIG=Release +# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG +# - cmake --build build --config $CONFIG -- -j2 +# env: +# - CMAKE=1 +# - <<: *test-cmake +# env: +# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" +# - <<: *test-cmake +# compiler: gcc +# env: +# - CMAKE=1 - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" +# - &test-macos +# os: osx 
+# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 # env: # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - env: +# - <<: *test-macos +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# env: # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - env: -# - 
CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" +# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" +# - <<: *test-macos +# osx_image: xcode11.5 +# env: +## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" +# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 os: linux From b06880c2cdfc8a0bd5caa2c1d62f7bba3611b932 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 10 Aug 2021 22:06:04 -0500 
Subject: [PATCH 061/143] POWER10: Improving dasum performance Unrolling a loop in dasum micro code to help in improving POWER10 performance. --- kernel/power/dasum.c | 4 +- kernel/power/dasum_microk_power10.c | 120 ++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 7507621cf..35390dd24 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -115,14 +115,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) - if ( n >= 16 ) + if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { sumf += ABS(x[i]); } } - n1 = (n-i) & -16; + n1 = (n-i) & -32; if ( n1 > 0 ) { sumf += dasum_kernel_16(n1, &x[i]); diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c index d1a21b4d1..110627fa4 100644 --- a/kernel/power/dasum_microk_power10.c +++ b/kernel/power/dasum_microk_power10.c @@ -34,6 +34,19 @@ static double dasum_kernel_16 (long n, double *x) __vector double t1; __vector double t2; __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + __vector double a0; + __vector double a1; + __vector double a2; + __vector double a3; + __vector double a4; + __vector double a5; + __vector double a6; + __vector double a7; + __asm__ ( @@ -48,14 +61,27 @@ static double dasum_kernel_16 (long n, double *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" + "xxlxor %x11, %x11, %x11 \n\t" + "xxlxor %x12, %x12, %x12 \n\t" + "xxlxor %x13, %x13, %x13 \n\t" + "xxlxor %x14, %x14, %x14 \n\t" + "xxlxor %x15, %x15, %x15 \n\t" + "xxlxor %x16, %x16, %x16 \n\t" + "xxlxor %x17, %x17, %x17 \n\t" + "xxlxor %x18, %x18, %x18 \n\t" + "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" "lxvp 44, 64(%2) \n\t" "lxvp 46, 96(%2) \n\t" + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + 
"lxvp 58, 224(%2) \n\t" - "addi %2, %2, 128 \n\t" + "addi %2, %2, 256 \n\t" - "addic. %1, %1, -16 \n\t" + "addic. %1, %1, -32 \n\t" "ble two%= \n\t" ".align 5 \n" @@ -65,33 +91,52 @@ static double dasum_kernel_16 (long n, double *x) "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" - "lxvp 40, 0(%2) \n\t" - "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" - "lxvp 42, 32(%2) \n\t" - - "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" - "lxvp 44, 64(%2) \n\t" - "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" - - "lxvp 46, 96(%2) \n\t" - "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" - "addi %2, %2, 128 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" - "addic. %1, %1, -16 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "addi %2, %2, 256 \n\t" + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" "two%=: \n\t" @@ -114,6 +159,25 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" @@ -122,7 +186,18 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" + "xvadddp %x11, %x11, %x12 \n\t" + "xvadddp %x13, %x13, %x14 \n\t" + "xvadddp %x15, %x15, %x16 \n\t" + "xvadddp %x17, %x17, %x18 \n\t" + + "xvadddp %x11, %x11, %x13 \n\t" + "xvadddp %x15, %x15, %x17 \n\t" + + "xvadddp %x11, %x11, %x15 \n\t" + "xvadddp 32, 32, 36 \n\t" + "xvadddp 32, 32, %x11 \n\t" XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" @@ -136,14 +211,27 @@ static double dasum_kernel_16 (long n, double *x) "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 - "=wa" (t3) // 6 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5), // 8 + "=wa" (t6), // 9 + "=wa" (t7), // 10 + "=wa" (a0), // 11 + "=wa" (a1), // 12 + "=wa" (a2), // 13 + "=wa" (a3), // 14 + "=wa" (a4), // 15 + "=wa" (a5), // 16 + "=wa" (a6), // 17 + "=wa" (a7) // 18 : "m" (*x) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" ); return sum; From c28560129f65c212eba0093e99f4c9163856bffa Mon Sep 17 00:00:00 2001 From: cianciosa Date: Wed, 11 Aug 2021 12:00:07 -0400 
Subject: [PATCH 062/143] Check the total number of arguments passed insead of if the ARGV# is defined. This fixes a problem when compling openblas as a subproject of another code. --- cmake/utils.cmake | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6b54092ea..09bae7011 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -157,31 +157,31 @@ endfunction () # STRING - compiles only the given type (e.g. DOUBLE) function(GenerateNamedObjects sources_in) - if (DEFINED ARGV1) + if (${ARGC} GREATER 1) set(defines_in ${ARGV1}) endif () - if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") + if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) # strip off extension for kernel files that pass in the object name. get_filename_component(name_in ${name_in} NAME_WE) endif () - if (DEFINED ARGV3) + if (${ARGC} GREATER 3) set(use_cblas ${ARGV3}) else () set(use_cblas false) endif () - if (DEFINED ARGV4) + if (${ARGC} GREATER 4) set(replace_last_with ${ARGV4}) endif () - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(append_with ${ARGV5}) endif () - if (DEFINED ARGV6) + if ${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) @@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in) set(real_only false) set(complex_only false) set(mangle_complex_sources false) - if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") + if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) @@ -342,17 +342,17 @@ endfunction () function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) set(alternate_name_in "") - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(alternate_name_in ${ARGV5}) endif () set(no_float_type false) - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) endif () set(complex_filename_scheme "") - if (DEFINED ARGV7) + 
if (${ARGC} GREATER 7) set(complex_filename_scheme ${ARGV7}) endif () From 4c766cd11fa3f27ed1b572225ab2e937e43a2bab Mon Sep 17 00:00:00 2001 From: cianciosa Date: Wed, 11 Aug 2021 12:08:34 -0400 Subject: [PATCH 063/143] Fix a small syntax error. A ( was accidently deleted. --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 09bae7011..01b489f2a 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -181,7 +181,7 @@ function(GenerateNamedObjects sources_in) set(append_with ${ARGV5}) endif () - if ${ARGC} GREATER 6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) From a7bc8ec1f107a95a18cfcdbd5c47721abfa75cb9 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 10 Aug 2021 16:42:57 +0800 Subject: [PATCH 064/143] Delete the macro instruction "li" and use "li.d" instead Change-Id: Icff7981e2eb7df29ba5af1f8eb5be8443c67450f --- kernel/loongarch64/asum.S | 2 +- kernel/loongarch64/cnrm2.S | 2 +- kernel/loongarch64/copy.S | 2 +- kernel/loongarch64/dot.S | 2 +- kernel/loongarch64/gemv_n.S | 4 ++-- kernel/loongarch64/gemv_t.S | 2 +- kernel/loongarch64/iamax.S | 12 ++++++------ kernel/loongarch64/iamin.S | 12 ++++++------ kernel/loongarch64/izamax.S | 12 ++++++------ kernel/loongarch64/izamin.S | 12 ++++++------ kernel/loongarch64/scal.S | 2 +- kernel/loongarch64/snrm2.S | 2 +- kernel/loongarch64/swap.S | 2 +- kernel/loongarch64/zcopy.S | 2 +- kernel/loongarch64/zdot.S | 2 +- kernel/loongarch64/zgemv_n.S | 4 ++-- kernel/loongarch64/zgemv_t.S | 2 +- kernel/loongarch64/zscal.S | 2 +- 18 files changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S index e4c717085..7d21ce038 100644 --- a/kernel/loongarch64/asum.S +++ b/kernel/loongarch64/asum.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE bge $r0, N, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index c4b2555d3..9d27987e1 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S index 28b7bce4c..3156f60b8 100644 --- a/kernel/loongarch64/copy.S +++ b/kernel/loongarch64/copy.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCY, 0(INCY) #endif - li TEMP, SIZE + li.d TEMP, SIZE NOP slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S index 4fcd569c8..1e4c81a02 100644 --- a/kernel/loongarch64/dot.S +++ b/kernel/loongarch64/dot.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 srai.d I, N, 3 diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S index 334a2991f..9ab43ae19 100644 --- a/kernel/loongarch64/gemv_n.S +++ b/kernel/loongarch64/gemv_n.S @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -472,7 +472,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L900: - li YORIG, SIZE + li.d YORIG, SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S index 19333ed4a..af4232769 100644 --- a/kernel/loongarch64/gemv_t.S +++ b/kernel/loongarch64/gemv_t.S @@ -88,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S index 0f9e1bc59..31b1a9e57 100644 --- a/kernel/loongarch64/iamax.S +++ b/kernel/loongarch64/iamax.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S index 7751a9d03..9364b9725 100644 --- a/kernel/loongarch64/iamin.S +++ b/kernel/loongarch64/iamin.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S index 6d7cb9e30..8d3ae529e 100644 --- a/kernel/loongarch64/izamax.S +++ b/kernel/loongarch64/izamax.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S index 998927985..38a109c21 100644 --- a/kernel/loongarch64/izamin.S +++ b/kernel/loongarch64/izamin.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S index 7399e57b3..566bce6cb 100644 --- a/kernel/loongarch64/scal.S +++ b/kernel/loongarch64/scal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE MTC a1, $r0 slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 14b62cfe7..57c21a017 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, SIZE + li.d TEMP, SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S index c9d8f7fc1..4578a8d54 100644 --- a/kernel/loongarch64/swap.S +++ b/kernel/loongarch64/swap.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 slli.d INCY, INCY, BASE_SHIFT diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S index 3fbe56074..0f480ca85 100644 --- a/kernel/loongarch64/zcopy.S +++ b/kernel/loongarch64/zcopy.S @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT INCY, 0(INCY) #endif - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE NOP slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S index 087c3845f..81ac19fbd 100644 --- a/kernel/loongarch64/zdot.S +++ b/kernel/loongarch64/zdot.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MOV s3, s2 MOV s4, s3 slli.d INCX, INCX, ZBASE_SHIFT - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 srai.d I, N, 2 diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S index 0cc49c789..d995ce86b 100644 --- a/kernel/loongarch64/zgemv_n.S +++ b/kernel/loongarch64/zgemv_n.S @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -576,7 +576,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L900: - li YORIG, 2 * SIZE + li.d YORIG, 2 * SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S index 85a9a0c0d..841823e1c 100644 --- a/kernel/loongarch64/zgemv_t.S +++ b/kernel/loongarch64/zgemv_t.S @@ -116,7 +116,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S index fe53ed713..a12e527a5 100644 --- a/kernel/loongarch64/zscal.S +++ b/kernel/loongarch64/zscal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE MTC a1, $r0 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 From 989e6bbdd39fe3d49789b803c4fd6b20a3a673e5 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 03:17:38 +0000 Subject: [PATCH 065/143] Small Matrix: reduce generic kernel source files --- kernel/CMakeLists.txt | 56 ++++----- kernel/Makefile.L3 | 112 +++++++++--------- .../generic/gemm_small_matrix_kernel_b0_nn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_nt.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tt.c | 49 -------- kernel/generic/gemm_small_matrix_kernel_nn.c | 11 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 9 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 8 ++ kernel/generic/gemm_small_matrix_kernel_tt.c | 8 ++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 ------------ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 ------------ kernel/generic/zgemm_small_matrix_kernel_nn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 11 ++ 18 files changed, 161 insertions(+), 588 deletions(-) delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 769a73b91..d8a230436 
100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -495,30 +495,30 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) else () - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) else () - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) else () - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) else () - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT 
../generic/gemm_small_matrix_kernel_tt.c) endif () endif () @@ -541,32 +541,32 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" 
"gemm_small_kernel_b0_cn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false 
${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" 
"gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f977793a0..ef11e391c 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4334,32 +4334,32 @@ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifndef DGEMM_SMALL_K_B0_NN -DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef DGEMM_SMALL_K_B0_NT -DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef DGEMM_SMALL_K_B0_TN -DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef DGEMM_SMALL_K_B0_TT -DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ 
$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef SGEMM_SMALL_M_PERMIT SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c @@ -4397,32 +4397,32 @@ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifndef SGEMM_SMALL_K_B0_NN -SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SGEMM_SMALL_K_B0_NT -SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SGEMM_SMALL_K_B0_TN -SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SGEMM_SMALL_K_B0_TT -SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o 
$@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4496,68 +4496,68 @@ $(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef CGEMM_SMALL_K_B0_NN -CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef CGEMM_SMALL_K_B0_NT -CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef CGEMM_SMALL_K_B0_TN -CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef CGEMM_SMALL_K_B0_TT -CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ 
$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ ifndef ZGEMM_SMALL_M_PERMIT ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4632,65 +4632,65 @@ $(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef ZGEMM_SMALL_K_B0_NN -ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef ZGEMM_SMALL_K_B0_NT -ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef ZGEMM_SMALL_K_B0_TN -ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef ZGEMM_SMALL_K_B0_TT -ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ 
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c deleted file mode 100644 index 3be918017..000000000 --- a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c +++ /dev/null @@ -1,49 +0,0 @@ -/*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) -{ - //naive implemtation - //Column major - - BLASLONG i,j,k; - FLOAT result=0.0; - - for(i=0; i Date: Fri, 13 Aug 2021 03:28:44 +0000 Subject: [PATCH 066/143] Small Matrix: skylakex: remove unnecessary b0 source files --- kernel/x86_64/KERNEL.SKYLAKEX | 16 ++++++++-------- .../x86_64/dgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tt_skylakex.c | 3 --- 9 files changed, 8 insertions(+), 25 deletions(-) delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 
kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index eb0cbaf98..6b4961bc2 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,13 +12,13 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c -SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c -SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c -SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c -SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c @@ -29,13 +29,13 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c -DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c -DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c -DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c -DGEMM_SMALL_K_B0_TT = 
dgemm_small_kernel_b0_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index a58738a25..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index eafe2ce49..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 1dfa0aaf1..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 93fab1836..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index 704e964b8..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index 6d7934be1..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 
-#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 0f9745b72..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 27d9e0afd..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,3 +0,0 @@ -#define B0 1 -#define TT 1 -#include "./sgemm_small_kernel_tt_skylakex.c" From 13d411677f4b0a617142b3fd4c15d7be4c442477 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Aug 2021 00:17:23 +0200 Subject: [PATCH 067/143] Add more OSX build jobs to Azure CI (#3338) * Add OSX build job with Homebrew OpenMP in a CMAKE build * Check install step on OSX/gcc to make sure all include files are generated and installed as intended * Add mixed clang/gfortran build with cmake on OSX * move IOS ARMV7/ARMV8 crossbuilds from travis to azure --- azure-pipelines.yml | 56 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 889b920e3..b1bded639 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -83,6 +83,8 @@ jobs: - script: | brew update make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + ls -lR ../blasinst - job: OSX_GCC_Nothreads pool: @@ -104,6 +106,38 @@ jobs: brew install llvm libomp make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 +- job: OSX_OpenMP_Clang_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib 
+ LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + make + ctest + +- job: OSX_OpenMP_Clang_gf_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. + make + ctest + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' @@ -146,7 +180,27 @@ jobs: brew install --cask android-ndk export ANDROID_NDK_HOME=/usr/local/share/android-ndk make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 - + +- job: OSX_IOS_ARMV8 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + steps: + - script: | + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: OSX_IOS_ARMV7 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + steps: + - script: | + make 
TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' From cdb5d2737e92d17c600903bf97ac32d1659ce324 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:22:51 +0100 Subject: [PATCH 068/143] add support for building on windows/arm64 target --- common_arm64.h | 2 +- ctest.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 2270ffba7..029e23886 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .text ; .p2align 2 ; .global REALNAME ; -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(_WIN32) .type REALNAME, %function ; #endif REALNAME: diff --git a/ctest.c b/ctest.c index 4f18918f5..2afd93f68 100644 --- a/ctest.c +++ b/ctest.c @@ -84,7 +84,7 @@ OS_AIX OS_OSF #endif -#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) OS_WINNT #endif @@ -141,7 +141,7 @@ ARCH_SPARC ARCH_IA64 #endif -#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) BINARY_64 #endif From c6c2a71fb7c4ea36558c911f964557b7ac3a35c8 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:25:07 +0100 Subject: [PATCH 069/143] Fix ctest.h to build using clang on windows --- utest/ctest.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utest/ctest.h b/utest/ctest.h index 037f7f28d..79961badf 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -65,9 +65,14 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) +#if defined(__clang__) +#define __CTEST_NO_TIME +#undef 
CTEST_SEGFAULT +#elif defined(_MSC_VER) #define __CTEST_MSVC #endif +#endif //config for MSVC compiler #ifdef __CTEST_MSVC @@ -286,7 +291,7 @@ void assert_dbl_far(double exp, double real, double tol, const char* caller, int #endif #include -#ifdef __CTEST_MSVC +#ifdef _WIN32 #include #else #include From e9acb464318618009d13ddcc7e30dc300e878052 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 25 Aug 2021 07:07:27 +0000 Subject: [PATCH 070/143] sgemv: skylakex: bug fix for sgemv_t kernel in corner case --- kernel/x86_64/sgemv_t_4.c | 2 +- .../x86_64/sgemv_t_microk_skylakex_template.c | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76236cd16..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" -/*#include "sgemv_t_microk_skylakex.c"*/ +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c index 34415054c..423413465 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex_template.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -93,7 +93,7 @@ static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { - for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_16x; idx_m+=32) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); @@ -145,8 +145,8 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { for (BLASLONG 
idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { - m0 = _mm512_loadu_ps(&a[idx_m]); - m1 = _mm512_loadu_ps(&a[idx_m + 16]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); @@ -157,7 +157,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * __mmask8 load_mask = *((__mmask8*) &load_mask_value); x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { - m0 = _mm512_loadu_ps(&a[idx_m]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -171,7 +171,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); - m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x]); + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -346,7 +346,7 @@ static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * c3 = _mm256_extractf32x4_ps(c256_2, 0); c4 = _mm256_extractf32x4_ps(c256_2, 1); - ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, y)); + ret = 
_mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); } @@ -958,6 +958,7 @@ static int sgemv_kernel_t_7(BLASLONG m, float alpha, float *a, float *x, float * c256_1 = _mm512_extractf32x8_ps(tmp0, 1); c256_0 = _mm256_add_ps(c256_0, c256_1); + c256_0 = _mm256_mul_ps(c256_0, alpha256); __m128 c128_0 = _mm256_extractf32x4_ps(c256_0, 0); __m128 c128_1 = _mm256_extractf32x4_ps(c256_0, 1); @@ -1016,9 +1017,10 @@ static int sgemv_kernel_t_8(BLASLONG m, float alpha, float *a, float *x, float * __m512 m0, m1, m2, m3; __m256 r0, r1, r2, r3, r4, r5, r6, r7, tmp0, tmp1, tmp2, tmp3; __m128 c128_0, c128_1, c128_2, c128_3; - __m128 alpha128 = _mm_set1_ps(alpha); + __m256 alpha256 = _mm256_set1_ps(alpha); __m256 x256 = _mm256_loadu_ps(x); + x256 = _mm256_mul_ps(x256, alpha256); __m512 x512 = _mm512_broadcast_f32x8(x256); for(BLASLONG idx_m=0; idx_m Date: Wed, 25 Aug 2021 07:13:00 +0000 Subject: [PATCH 071/143] sgemv: skylakex: fix build warning --- kernel/x86_64/sgemv_n_4.c | 3 --- kernel/x86_64/sgemv_t_microk_skylakex_template.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 06de28d97..90865c4b3 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -302,9 +302,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT * xbuffer_align = x; FLOAT * ybuffer_align = y; - FLOAT * xbuffer = NULL; - FLOAT * ybuffer = NULL; - if (inc_x != 1) { xbuffer_align = buffer; for(BLASLONG i=0; i> (16-((m-tag_m_8x)*2)&15)); + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(((m-tag_m_8x)*2)&15))); __mmask16 a_mask = *((__mmask16*) &tail_mask_value); unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); @@ -322,7 +322,7 @@ static int 
sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * { BLASLONG tag_m_4x = m & (~3); BLASLONG tag_m_2x = m & (~1); - __m512 m0, m1, m2; + __m512 m0, m1; __m256 m256_0, m256_1, c256_1, c256_2; __m128 c1, c2, c3, c4, ret; __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); From 7d1becc575d436039f1484259a10413aade9cda9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 14:18:36 +0200 Subject: [PATCH 072/143] Allocate an auxiliary struct when running out of preconfigured threads --- driver/others/memory.c | 145 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 460a3d557..377e073ee 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2636,8 +2636,25 @@ static volatile struct { } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +static volatile struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2779,6 +2796,29 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + if (memory_overflowed) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +} goto error; allocation : @@ -2883,6 +2923,90 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: + if (memory_overflowed) goto terminate; + printf("num_buffers exceeded, adding auxiliary array\n"); + memory_overflowed=1; + newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (int i=0;i<512;i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + newmemory[position-NUM_BUFFERS].used = 1; + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS 
Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +//#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: printf("OpenBLAS : Program is Terminated. 
Because you tried to allocate too many memory regions.\n"); printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); @@ -2907,13 +3031,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (memory_overflowed) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +//#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2927,7 +3066,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); From b4b952eece8344fe5d7adf2352791ab81d0d1d8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:03:53 +0200 Subject: [PATCH 073/143] Add auxiliary tracking space for thread buffer frees too --- driver/others/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 377e073ee..d4fdfa465 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2060,6 +2060,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2110,8 +2111,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2274,8 +2280,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + { else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2307,8 +2318,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void 
*)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2341,8 +2357,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2370,8 +2391,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2414,9 +2440,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2450,9 +2482,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; 
release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2556,8 +2594,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2604,9 +2647,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2663,6 +2712,8 @@ static int memory_overflowed = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2926,8 +2977,9 @@ void *blas_memory_alloc(int procpos){ if (memory_overflowed) goto terminate; printf("num_buffers exceeded, adding auxiliary array\n"); memory_overflowed=1; - newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (int i=0;i<512;i++) { + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ 
-3101,7 +3153,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (pos < NUM_BUFFERS) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -3118,6 +3173,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); From 2ba9a567aaaac875be19a76009853b2ee4597dbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:14:59 +0200 Subject: [PATCH 074/143] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d4fdfa465..3825e83ae 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2283,7 +2283,7 @@ static void *alloc_mmap(void *address){ if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; - { else { + } else { new_release_info[release_pos-NUM_BUFFERS].address = map_address; new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; } From 7fd12a5e69164b62dad7fbddf1581d941e5339fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 13:54:51 +0200 Subject: [PATCH 075/143] Add likely() hints for gcc --- driver/others/memory.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 3825e83ae..689aba942 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -2111,7 +2119,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2280,7 +2288,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2318,7 +2326,7 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; } else { @@ -2357,7 +2365,7 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; } else { @@ -2391,7 +2399,7 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; } else { @@ -2440,7 +2448,7 @@ static void *alloc_devicedirver(void 
*address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; @@ -2482,7 +2490,7 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; @@ -2594,7 +2602,7 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; } else { @@ -2647,7 +2655,7 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; @@ -3153,7 +3161,7 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { - if (pos < NUM_BUFFERS) + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); else new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); From 89fc5b8f4f1c56b50896773e667c3a215342e49c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 19:50:24 +0200 Subject: [PATCH 076/143] Fix unmap logic --- driver/others/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 689aba942..1f66ef9e9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,8 +76,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #else #define likely(x) (x) +#define unlikely(x) (x) #endif #endif @@ -3097,7 +3099,7 @@ void blas_memory_free(void *free_area){ if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif - if (memory_overflowed) { + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory From 1d83ca4bca890536f1c7713a3432a9daf59d2c2c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 03:14:18 +0000 Subject: [PATCH 077/143] Small Matrix: support BFLOAT16 data type --- common_level3.h | 12 ++++ common_macro.h | 18 ++--- common_param.h | 13 ++++ common_sb.h | 12 ++++ interface/gemm.c | 6 +- kernel/Makefile.L3 | 75 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 +- kernel/setparam-ref.c | 5 ++ 11 files changed, 137 insertions(+), 20 deletions(-) diff --git a/common_level3.h b/common_level3.h index 187402a9a..5080ada10 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,6 +516,13 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG 
ldc); +int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); @@ -530,6 +537,11 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, 
float * C, BLASLONG ldc); int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index aeb9a205b..cf2a3fd88 100644 --- a/common_macro.h +++ b/common_macro.h @@ -942,17 +942,17 @@ #define GEADD_K SGEADD_K -#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT +#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT -#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN -#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT -#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN -#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN -#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT -#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN -#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT #endif diff --git a/common_param.h b/common_param.h index 7e8bea4fe..31fba9059 100644 --- a/common_param.h +++ b/common_param.h @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#ifdef SMALL_MATRIX_OPT + int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, 
BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif #endif #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) diff --git a/common_sb.h b/common_sb.h index 9976e812e..d21e7a563 100644 --- a/common_sb.h +++ b/common_sb.h @@ -24,6 +24,7 @@ #define SBGEMM_BETA sbgemm_beta #define SBGEMM_KERNEL sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit #else #define SBDOT_K gotoblas -> sbdot_k @@ -41,8 +42,19 @@ #define SBGEMM_BETA gotoblas -> sbgemm_beta #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit #endif +#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) +#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) +#define SBGEMM_SMALL_KERNEL_TN 
FUNC_OFFSET(sbgemm_small_kernel_tn) +#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) + +#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) +#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) +#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) +#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) + #define SBGEMM_NN sbgemm_nn #define SBGEMM_CN sbgemm_tn #define SBGEMM_TN sbgemm_tn diff --git a/interface/gemm.c b/interface/gemm.c index 3497d8651..47e0ca0c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,7 +105,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) #define USE_SMALL_MATRIX_OPT 1 #else #define USE_SMALL_MATRIX_OPT 0 @@ -131,8 +131,8 @@ static size_t gemm_small_kernel_b0[] = { GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, }; -#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) -#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else static size_t zgemm_small_kernel[] = { diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ef11e391c..404f774cc 100644 --- a/kernel/Makefile.L3 +++ 
b/kernel/Makefile.L3 @@ -450,6 +450,15 @@ endif ###### BLAS small matrix optimization ##### ifeq ($(SMALL_MATRIX_OPT), 1) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ @@ -4424,6 +4433,72 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + 
+$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c endif diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index 71700a1fa..b0638c7ea 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index b287b3837..0a965db58 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index c41ea7211..69ffc718c 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 734510c67..9d68de3f9 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f303d0dc6..19b7b5f0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -112,6 +112,11 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif +#ifdef SMALL_MATRIX_OPT + sbgemm_small_matrix_permitTS, + sbgemm_small_kernel_nnTS, sbgemm_small_kernel_ntTS, sbgemm_small_kernel_tnTS, sbgemm_small_kernel_ttTS, + 
sbgemm_small_kernel_b0_nnTS, sbgemm_small_kernel_b0_ntTS, sbgemm_small_kernel_b0_tnTS, sbgemm_small_kernel_b0_ttTS, +#endif #endif #if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) From 7d27b182fc6cb2d1b8fc7967c40dd89727fcf875 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 06:10:51 +0000 Subject: [PATCH 078/143] sbgemm: cooperlake: enable SBGEMM by small matrix path --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +++ .../x86_64/sbgemm_block_microk_cooperlake.c | 19 +--- .../sbgemm_microk_cooperlake_template.c | 5 +- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 + .../sbgemm_small_kernel_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_permit_cooperlake.c | 42 +++++++++ .../sbgemm_small_kernel_template_cooperlake.c | 89 +++++++++++++++++++ .../sbgemm_small_kernel_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_tt_cooperlake.c | 2 + 13 files changed, 162 insertions(+), 19 deletions(-) create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 0b2f3c0ed..151c02d5a 100644 --- 
a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1 +1,11 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX + +SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c +SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 147c5ebdd..2c27221ac 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1,6 +1,5 @@ -//#include "sbgemm.h" - #include + // Walk around those intrinsics that missed by compiler #define MM256_LOADU_EPI16(addr) \ _mm256_maskz_loadu_epi16(~0, (addr)) @@ -1747,7 +1746,7 @@ void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG } // Scale matrix C when beta is not ZERO or ONE -void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1759,12 +1758,6 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 array_512_0, array_512_1, array_512_2, array_512_3; __m512 BETAVECTOR = _mm512_set1_ps(beta); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); @@ 
-1828,7 +1821,7 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST } // Zero C matrix when Beta is 0 -void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1839,12 +1832,6 @@ void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index c71595813..b8ed9838e 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -1,8 +1,6 @@ -#include "sbgemm.h" #include "bf16_common_macros.h" #include -/* These macros are needed and should be placed at the right place #define BF16_BLOCK_STEP_N 8 #define BF16_BLOCK_THRES_K 1024 #define BF16_BLOCK_THRES_M 32 @@ -14,7 +12,6 @@ #define ONE 1.e0f #define ZERO 0.e0f -*/ #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT @@ -1798,6 +1795,7 @@ void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, } /* ----------------------------------------- End of TT kernels --------------------------------------- */ +/* #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 
*B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) @@ -1836,3 +1834,4 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ } } } +*/ diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c new file mode 100644 index 000000000..373457f84 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c new file mode 100644 index 000000000..0b840c248 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c new file mode 100644 index 000000000..67542b69c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c new file mode 100644 index 000000000..17b5b41c5 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c new file mode 100644 index 000000000..ec40a5054 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c new file mode 100644 index 000000000..1cdfd2936 --- /dev/null +++ 
b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c new file mode 100644 index 000000000..823aafbdd --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 1; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c new file mode 100644 index 000000000..d328b0981 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +extern void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc); +extern void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc); + +extern void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, 
blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); + +#if defined(TRANS_NN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nn_alpha +#elif defined(TRANS_NT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nt_alpha +#elif defined(TRANS_TN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tn_alpha +#elif defined(TRANS_TT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tt_alpha +#endif + +#define BF16_BLOCK_THRES_K 1024 +// If we want to adjust this to be bigger, need to change COL_MAJOR_INCOPY_KERNEL_Kx32 kernel to be bigger also +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + bfloat16 * block_A; + bfloat16 * block_B; + + block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); + block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + +#if defined(B0) + sbgemm_zero_operation(M, N, C, ldc); +#else + sbgemm_scal_operation(M, N, beta, C, ldc); 
+#endif + + if (alpha == ONE) { + SBGEMM_BLOCKING_KERNEL_ONE(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + free(block_A); + free(block_B); + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c new file mode 100644 index 000000000..f1a0d0d0c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c new file mode 100644 index 000000000..8a2a597bc --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TT +#include "sbgemm_small_kernel_template_cooperlake.c" From 2e44ca0136da2829e1c2e65e2cdd4a8d540491a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 00:51:24 +0800 Subject: [PATCH 079/143] sbgemm: add missing cblas_sbgemm definition --- cblas.h | 2 ++ interface/gemm.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index f0220eb99..a5ad25ad7 100644 --- a/cblas.h +++ b/cblas.h @@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum 
CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/interface/gemm.c b/interface/gemm.c index 47e0ca0c3..6dcc54041 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -273,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, - FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + IFLOAT *a, blasint lda, + IFLOAT *b, blasint ldb, FLOAT beta, FLOAT *c, blasint ldc) { #else From f39301935c27e34acbf95757e644ba6e3ce95cef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 18:43:41 +0800 Subject: [PATCH 080/143] sbgemm: cooperlake: make sure hot buffer aligned to 64 --- .../sbgemm_small_kernel_template_cooperlake.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c index d328b0981..1ab7a34ab 100644 --- a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -59,6 +59,10 @@ extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float #define BF16_BLOCK_THRES_M 32 #define BF16_BLOCK_THRES_N 1024 +#define MALLOC_ALIGN64(ptr, size, raw_ptr) \ + raw_ptr = malloc((size) + 63); \ + ptr = (bfloat16 *)(((uintptr_t) raw_ptr + 63) & ~(uintptr_t)63) + #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) @@ -68,9 +72,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG 
lda, FLOAT al { bfloat16 * block_A; bfloat16 * block_B; + void* raw_ptrA; + void* raw_ptrB; - block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); - block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + MALLOC_ALIGN64(block_A, sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M, raw_ptrA); + MALLOC_ALIGN64(block_B, sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K, raw_ptrB); #if defined(B0) sbgemm_zero_operation(M, N, C, ldc); @@ -83,7 +89,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT al } else { SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); } - free(block_A); - free(block_B); + + free(raw_ptrA); + free(raw_ptrB); return 0; } From 619588fbabaa0ee470487b9afd063541e95c486b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 30 Aug 2021 17:48:11 +0800 Subject: [PATCH 081/143] sbgemm: remove unnecessary b0 files --- kernel/Makefile.L3 | 16 ++++++++-------- kernel/generic/gemm_small_matrix_kernel_nn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 ++-- kernel/x86_64/KERNEL.COOPERLAKE | 8 ++++---- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 -- 10 files changed, 20 insertions(+), 28 deletions(-) delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 404f774cc..49b7c78fb 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4471,32 +4471,32 
@@ $(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ ifndef SBGEMM_SMALL_K_B0_NN -SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SBGEMM_SMALL_K_B0_NT -SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SBGEMM_SMALL_K_B0_TN -SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SBGEMM_SMALL_K_B0_TT -SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ endif ifndef CGEMM_SMALL_M_PERMIT diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index b0638c7ea..543e7e047 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index 0a965db58..d4a7aec6a 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index 69ffc718c..2747337f2 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 9d68de3f9..eec926bc7 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 151c02d5a..6272dd73d 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -2,10 +2,10 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c -SBGEMM_SMALL_K_B0_NN = 
sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c -SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_nt_cooperlake.c SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c -SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c -SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c deleted file mode 100644 index 373457f84..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c deleted file mode 100644 index 0b840c248..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c deleted file mode 100644 index 67542b69c..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c deleted file mode 100644 index 17b5b41c5..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_tt_cooperlake.c" From 2db1a99aca0177761f47daa71b27450923eb127e Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:21:25 +0200 Subject: [PATCH 082/143] Clean up debug messages --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1f66ef9e9..c560c4e90 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - printf("num_buffers exceeded, adding auxiliary array\n"); + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); @@ -3057,9 +3057,9 @@ allocation2: UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); -//#endif +#endif #if defined(WHEREAMI) && !defined(USE_OPENMP) @@ -3110,9 +3110,9 @@ void blas_memory_free(void *free_area){ UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf("Unmap from overflow area succeeded.\n\n"); -//#endif +#endif return; } else { // arm: ensure all writes are finished before other thread takes this memory From cd10d1c03be5ecbdf8bda6e448a6cac27f8aa1be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:38:28 +0200 Subject: [PATCH 083/143] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index c560c4e90..48067923e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") + 
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); From d1ed72fa87b2c1cdefed4b34682e719a9b326a8c Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 24 Aug 2021 06:09:29 +0100 Subject: [PATCH 084/143] [win/arm64]: Explicit casting for GMEMM_DEFAULT_ALIGN to create 64-bit value Win64 uses LLP64 datamodel and unsigned long is only 32-bit. For 64-bit architecture we need 64-bit mask to correctly generate address --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 634e0ef5d..5250b2f39 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL #define SYMV_P 16 From 7cddbf99b1dd9f99203daf9430c5d87f4eac6b56 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 31 Aug 2021 14:36:44 +0100 Subject: [PATCH 085/143] Make explicit conversion condition on _WIN64 flag --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index 5250b2f39..07397a66e 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 +#ifdef _WIN64 +/* Use explicit casting for win64 as LLP64 datamodel is used */ #define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL +#else +#define GEMM_DEFAULT_ALIGN 0x03fffUL +#endif #define SYMV_P 16 From f1e33059746c1fc3a4df76f524c1d4f37f9665b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Sep 2021 21:36:50 +0200 Subject: [PATCH 086/143] Add workaround for Windows10 macro name clash --- kernel/Makefile.L3 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 49b7c78fb..2d274d33b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4544,7 +4544,7 @@ $(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4556,7 +4556,7 @@ $(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4608,7 +4608,7 @@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4620,7 +4620,7 @@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ @@ -4680,7 +4680,7 @@ $(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4692,7 +4692,7 @@ $(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4744,7 +4744,7 @@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) 
$(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4756,7 +4756,7 @@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ From af19cda65aef4d033ae33213013c88b0a99f9da2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Sep 2021 18:26:59 +0200 Subject: [PATCH 087/143] Add "recursive" option for IBM xlf compiler (#3359) * Add correct "recursive" option for xlf (from reference-lapack issue 606) --- Makefile.power | 12 ++++++++++++ cmake/fc.cmake | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 946f55232..4e7478213 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,9 +12,13 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER, IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -33,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -57,7 +65,11 @@ 
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 631664569..f7aa4c5c9 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -104,7 +104,7 @@ endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) From 72f3ce5f084c40006e4548ec2a0de2751f5d2dd9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 Sep 2021 20:35:48 +0200 Subject: [PATCH 088/143] Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness (#3360) * Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness * Update .travis.yml --- .travis.yml | 2 +- getarch.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8657b64f4..8a3d2e5bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c diff --git a/getarch.c b/getarch.c index 6e43616f7..3b08cbfa9 100644 --- a/getarch.c +++ b/getarch.c @@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -354,6 +376,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -384,6 +425,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ From 32fee860330379774a895a18960640120506d317 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Sep 2021 23:44:20 +0200 Subject: [PATCH 089/143] Correct misplaced ifdef lines --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 3b08cbfa9..094feaadd 100644 --- a/getarch.c +++ b/getarch.c @@ -372,10 +372,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" @@ -421,10 +421,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" From 349fb4910b7ba2069ffe8374c14b06fcf419f7c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:19:51 +0200 Subject: [PATCH 090/143] Disable the remaining x86_64 job on Travis --- .travis.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8a3d2e5bb..3dc5fe290 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,24 +7,24 @@ language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran - before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - script: - - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - make -C test $COMMON_FLAGS $BTYPE - - make -C ctest $COMMON_FLAGS $BTYPE - - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu os: linux-ppc64le before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" From 8c68b6f26d1030f2bb932d8b885cb8d076a84437 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:40:40 +0200 Subject: [PATCH 091/143] Update .travis.yml --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3dc5fe290..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,8 +26,13 @@ matrix: # # - <<: *test-ubuntu os: linux-ppc64le - before_script: + before_script: &common-before - 
COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + script: + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX From 4c294336e6bc1b249721c0d9f0ee210d010db9f9 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 03:23:45 +0000 Subject: [PATCH 092/143] sbgemm: cooperlake: add dummy source files --- kernel/x86_64/KERNEL.COOPERLAKE | 11 +++++++ kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c | 32 ++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 7 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 6272dd73d..197907261 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c +SBGEMMINCOPYOBJ = 
sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c new file mode 100644 index 000000000..ea2600067 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c @@ -0,0 +1,32 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ +} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include <immintrin.h> int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; From ef8f5fecc8f532081eb63ded20da650b57e78e54 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 06:14:45 +0000 Subject: [PATCH 093/143] sbgemm: cooperlake: implement sbgemm_tcopy_32 --- kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c index afcf6f647..3e37473ca 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -26,8 +26,116 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ #include <immintrin.h> +#include <stdint.h> #include "common.h" int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *boffset; + + boffset = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table = { + 0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10, 7, + 8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... + */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset, a0); + _mm512_storeu_si512(boffset + 32, a1); + _mm512_storeu_si512(boffset + 64, a2); + _mm512_storeu_si512(boffset + 96, a3); + boffset += 128; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); +
+ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset, a0); + _mm512_storeu_si512(boffset + 32, a1); + boffset += 64; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + _mm512_storeu_si512(boffset, a0); + boffset += 32; + } + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + __mmask16 w_mask = (1UL << (remains - 16)) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset, a0); + _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1); + boffset += 2 * remains; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + _mm512_mask_storeu_epi32(boffset, w_mask, a0); + boffset += 2 * remains; + } + } + for (; i < m; i++) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm512_mask_storeu_epi16(boffset, r_mask, a0); + boffset += remains; + } + } } From 2ec9f3a8aa67e7b36612bc8faf34397e2a968b27 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 01:46:49 +0000 Subject: [PATCH 094/143] sbgemm: cooperlake: change kernel size to 16x4 --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 126 +++++++++++ 
kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c | 32 --- ...perlake.c => sbgemm_ncopy_16_cooperlake.c} | 0 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c | 207 ++++++++++++++++++ ...perlake.c => sbgemm_tcopy_16_cooperlake.c} | 73 +++--- ...operlake.c => sbgemm_tcopy_4_cooperlake.c} | 0 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c | 33 --- 8 files changed, 385 insertions(+), 96 deletions(-) create mode 100644 kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c rename kernel/x86_64/{sbgemm_ncopy_32_cooperlake.c => sbgemm_ncopy_16_cooperlake.c} (100%) create mode 100644 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c rename kernel/x86_64/{sbgemm_tcopy_32_cooperlake.c => sbgemm_tcopy_16_cooperlake.c} (71%) rename kernel/x86_64/{sbgemm_ncopy_8_cooperlake.c => sbgemm_tcopy_4_cooperlake.c} (100%) delete mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 197907261..dba94aea8 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -11,11 +11,11 @@ SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_BETA = sgemm_beta_skylakex.c -SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c -SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c -SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c -SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c -SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c +SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c new file mode 
100644 index 000000000..05ba015d2 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcastd_epi32(xmm); \ + B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define STORE_4X(A, Bx, By) + + + +int CNAME
(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + + for (; n_count > 23; n_count -= 24) { + m_count = m; + ptr_b0 = ptr_b; + ptr_b1 = ptr_b0 + n_blksize * 3; + for (; m_count > 15; m_count -= 16) { + DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24 * 2; + ptr_b1 += 24 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24; + ptr_b1 += 24; + ptr_a0 += 16; + } + } + } +} diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c deleted file mode 100644 index ea2600067..000000000 --- a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -{ -} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_ncopy_16_cooperlake.c diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..523e3b48f --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include <stdint.h> +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail
= r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = n & ~31; + BLASLONG m8 = n & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + _mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128(aoffset0 + i); + __m128i r1 = _mm_loadu_si128(aoffset1 + i); + __m128i r2 = _mm_loadu_si128(aoffset2 + i); + __m128i r3 = _mm_loadu_si128(aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128(boffset + 8*0, r0); + _mm_storeu_si128(boffset + 8*1, r1); + _mm_storeu_si128(boffset + 8*2, r2); + _mm_storeu_si128(boffset + 8*3, r3); 
+ boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128(boffset + 8*2, r0); + case 2: _mm_storeu_si128(boffset + 8*1, r0); + case 1: _mm_storeu_si128(boffset + 8*0, r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128(aoffset2 + i); + case 2: r1 = _mm_loadu_si128(aoffset1 + i); + case 1: r0 = _mm_loadu_si128(aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); + boffset += 4 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, 
r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; + + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c similarity index 71% rename from kernel/x86_64/sbgemm_tcopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 3e37473ca..16bf48f0b 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -32,23 +32,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - IFLOAT *boffset; + IFLOAT *boffset0, *boffset1; - boffset = b; + boffset0 = b; BLASLONG n32 = n & ~31; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; uint32_t permute_table = { - 0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10, 7, - 8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15, + 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, + 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, }; __m512i idx_lo = _mm512_loadu_si512(permute_table); __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; for (i = 0; i < m4; i += 4) { /* bf16 fma need special memory layout: * for memory layout like below: @@ -72,11 +74,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a2 = 
_mm512_permutex2var_epi32(a10, idx_lo, a11); a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - _mm512_storeu_si512(boffset + 64, a2); - _mm512_storeu_si512(boffset + 96, a3); - boffset += 128; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; } for (; i < m2; i += 2) { __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); @@ -88,22 +91,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - boffset += 64; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; } for (; i < m; i++) { /* just copy the only remains row */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - _mm512_storeu_si512(boffset, a0); - boffset += 32; + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_storeu_si256(boffset1, a1); + boffset0 += 16; + boffset1 += 16; } + boffset0 = boffset1; } if (j < n) { uint32_t remains = n - j; __mmask32 r_mask = (1UL << remains) - 1; if (remains > 16) { - __mmask16 w_mask = (1UL << (remains - 16)) - 1; + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; for (i = 0; i < m2; i += 2) { __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); @@ -114,9 +124,19 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - 
_mm512_storeu_si512(boffset, a0); - _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1); - boffset += 2 * remains; + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; } } else { __mmask16 w_mask = (1UL << remains ) - 1; @@ -128,14 +148,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m512i a01 = _mm512_unpackhi_epi16(a0, a1); a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - _mm512_mask_storeu_epi32(boffset, w_mask, a0); - boffset += 2 * remains; + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; } - } - for (; i < m; i++) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - _mm512_mask_storeu_epi16(boffset, r_mask, a0); - boffset += remains; } } } diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_8_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_4_cooperlake.c diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c deleted file mode 100644 index afcf6f647..000000000 --- a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c +++ /dev/null @@ -1,33 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - -} From 9df0953cde0833644155eb6f22d241fc773504a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 16 Aug 2021 19:39:24 +0800 Subject: [PATCH 095/143] sbgemm: cooperlake: kernel works for NN --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 375 +++++++++++++++++- kernel/x86_64/sbgemm_ncopy_4_cooperlake.c | 51 +-- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 259 ++++++------ 3 files changed, 515 insertions(+), 170 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 05ba015d2..d604235c9 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -31,8 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) #define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) #define BROADCAST64(base, step, n, offset, zmm) \ - if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ - else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) #define DECLARE_A_PAIR(A) \ __m512i A_lo_##A; __m512i A_hi_##A; @@ -41,8 +41,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VMOVLDUP(ptr_a##A, A_lo_##A); \ VMOVHDUP(ptr_a##A, A_hi_##A); +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + #define LOAD_A_PAIR_TAIL(A) { \ - __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ @@ -53,13 +66,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ - BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} #define BROADCAST_B_PAIR_TAIL(Bx, By) {\ - __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ xmm = _mm_cvtepu16_epi32(xmm); \ - B_lo = _mm512_broadcastd_epi32(xmm); \ - B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = 
_mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ } #define DECLARE_RESULT_4X(A, Bx, By) \ @@ -76,25 +102,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); -#define STORE_4X(A, Bx, By) +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define _MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, 
result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) { - IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_a = A, *ptr_b = B; IFLOAT *ptr_b0, *ptr_b1; IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; BLASLONG n_count = n; BLASLONG m_count, k_count; BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; m_count = m; - ptr_b0 
= ptr_b; - ptr_b1 = ptr_b0 + n_blksize * 3; for (; m_count > 15; m_count -= 16) { - DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { @@ -105,8 +209,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24 * 2; - ptr_b1 += 24 * 2; + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; ptr_a0 += 16 * 2; } if (k_count > 0) { @@ -117,10 +221,249 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24; - ptr_b1 += 24; + ptr_b0 += 4; + ptr_b1 += 4; ptr_a0 += 16; } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) 
{ + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4; + ptr_b1 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + ptr_a1 += 16; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + } + if (m > 31) { + ptr_a0 = ptr_a1; + } + for (; m_count > 15; m_count 
-= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 
+= 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; } } + return 0; } diff --git 
a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c index 523e3b48f..eefbd7355 100644 --- a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -79,8 +79,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset = a; boffset = b; - BLASLONG m32 = n & ~31; - BLASLONG m8 = n & ~7; + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; BLASLONG n4 = n & ~3; int permute_table[] = { @@ -115,15 +115,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset += 32 * 4; } for (; i < m8; i += 8) { - __m128i r0 = _mm_loadu_si128(aoffset0 + i); - __m128i r1 = _mm_loadu_si128(aoffset1 + i); - __m128i r2 = _mm_loadu_si128(aoffset2 + i); - __m128i r3 = _mm_loadu_si128(aoffset3 + i); + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); REORDER_4x8(r0, r1, r2, r3); - _mm_storeu_si128(boffset + 8*0, r0); - _mm_storeu_si128(boffset + 8*1, r1); - _mm_storeu_si128(boffset + 8*2, r2); - _mm_storeu_si128(boffset + 8*3, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), r3); boffset += 8 * 4; } if (i < m) { @@ -138,9 +138,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ // store should skip the tail odd line int num_store = remain_m/2; switch(num_store) { - case 3: _mm_storeu_si128(boffset + 8*2, r0); - case 2: _mm_storeu_si128(boffset + 8*1, r0); - case 1: _mm_storeu_si128(boffset + 8*0, r0); + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); } boffset += 8 * num_store; @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, 
BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ * need to extract lo words of data and store */ tail = _mm_cvtepi32_epi16(tail); - _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid boffset += 4; } } @@ -167,16 +167,16 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m128i r0, r1, r2, r3; for (i = 0; i < m8; i += 8) { switch (remain_n) { - case 3: r2 = _mm_loadu_si128(aoffset2 + i); - case 2: r1 = _mm_loadu_si128(aoffset1 + i); - case 1: r0 = _mm_loadu_si128(aoffset0 + i); + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); } REORDER_4x8(r0, r1, r2, r3); - _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); - _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); - boffset += 4 * remain_n; + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; } if (i < m) { int remain_m = m - i; @@ -190,9 +190,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ int num_store = remain_m/2; switch (num_store) { - case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); } boffset += 2 * num_store * remain_n; @@ -204,4 +204,5 @@ int CNAME(BLASLONG 
m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 16bf48f0b..ce4458d2c 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -29,134 +29,135 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG i, j; - - IFLOAT *boffset0, *boffset1; - - boffset0 = b; - - BLASLONG n32 = n & ~31; - BLASLONG m4 = m & ~3; - BLASLONG m2 = m & ~1; - - uint32_t permute_table = { - 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, - 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, - }; - - __m512i idx_lo = _mm512_loadu_si512(permute_table); - __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); - - for (j = 0; j < n32; j += 32) { - /* process 2x16 n at the same time */ - boffset1 = boffset0 + m * 16; - for (i = 0; i < m4; i += 4) { - /* bf16 fma need special memory layout: - * for memory layout like below: - * a00, a01, a02, a03, a04, a05 .... - * a10, a11, a12, a13, a14, a15 .... - * need to copy as: - * a00, a10, a01, a11, a02, a12, a03, a13, ... 
- */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); - __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - __m512i a10 = _mm512_unpacklo_epi16(a2, a3); - __m512i a11 = _mm512_unpackhi_epi16(a2, a3); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); - a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - _mm512_storeu_si512(boffset0 + 32, a2); - _mm512_storeu_si512(boffset1 + 32, a3); - boffset0 += 64; - boffset1 += 64; - } - for (; i < m2; i += 2) { - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - boffset0 += 32; - boffset1 += 32; - } - for (; i < m; i++) { - /* just copy the only remains row */ - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_storeu_si256(boffset1, a1); - boffset0 += 16; - boffset1 += 16; - } - boffset0 = boffset1; - } - if (j < n) { - uint32_t remains = n - j; - __mmask32 r_mask = (1UL << remains) - 1; - if (remains > 16) { - boffset1 = boffset0 + m * 16; - uint32_t tail1 = remains - 16; - __mmask16 w_mask1 = (1UL << tail1) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - 
__m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); - - boffset0 += 32; - boffset1 += 2 * tail1; - } - for (; i < m; i++) { - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); - boffset0 += 16; - boffset1 += tail1; - } - } else { - __mmask16 w_mask = (1UL << remains ) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - - _mm512_mask_storeu_epi32(boffset0, w_mask, a0); - boffset0 += 2 * remains; - } - for (; i < m; i++) { - __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); - _mm256_mask_storeu_epi16(boffset0, w_mask, a0); - boffset0 += remains; - } - } - } + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, 
a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... + */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) 
{ + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } } From 8356a604f0bab4844827a1b622aa5c481157bd4b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 19:35:40 +0800 Subject: [PATCH 096/143] sbgemm: cooperlake: tuning for block params --- driver/others/parameter.c | 1 + param.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 36da13369..d7dbddc7c 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,6 +524,7 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 
1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/param.h b/param.h index 07397a66e..48770fa7a 100644 --- a/param.h +++ b/param.h @@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 From cece3541ab739f94add22fda840276033d0feb97 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 21:13:29 +0800 Subject: [PATCH 097/143] sbgemm: cooperlake: fix bug in m64n12 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index d604235c9..c257a3f60 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -306,9 +306,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); ptr_c += 16 * 2; - } - if (m > 31) { ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; } for (; m_count > 15; m_count -= 16) { ptr_b0 = 
ptr_b00; From 45fdf951b64aa9145996727ecda901f00a2eda3c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 22:08:24 +0800 Subject: [PATCH 098/143] sbgemm: cooperlake: reorder ptr increase for performance --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index c257a3f60..4c1f50650 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -203,27 +203,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -240,27 +240,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 
0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += m_count * 2; } if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); @@ -284,21 +284,21 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; - ptr_a1 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; - ptr_a1 += 16; } ptr_c0 = ptr_c; ptr_c1 = ptr_c + 16; @@ -316,19 +316,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { 
LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -342,19 +342,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += m_count * 2; } if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); From 7a2d1601ec84c146b01eeb227d65b51c7855d1ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 23:21:19 +0800 Subject: [PATCH 099/143] sbgemm: cooperlake: unroll core loop by 2 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 4c1f50650..0280b441e 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -201,7 +201,31 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_B_PAIR(); 
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); - for (k_count = k; k_count > 1; k_count -=2) { + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); From bb1c4fa5bdf93724075ed400e3ff5bbdabd0b31a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 18 Aug 2021 21:17:08 +0800 Subject: [PATCH 100/143] sbgemm: cooperlake: prefetch A & B --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 0280b441e..7af51b6d8 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -64,6 +64,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DECLARE_B_PAIR() \ __m512i B_lo; __m512i B_hi; +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + #define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); @@ -204,17 +209,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * k_count = k; for (; k_count > 3; k_count -=4) { LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; - BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); - BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); - BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); - BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); - BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); ptr_b1 += 4 * 2; LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); From 5fcacad32bb71fd6c6e04e078eeaf59120a9ba72 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 00:08:06 +0800 Subject: [PATCH 101/143] sbgemm: cooperlake: implement tcopy_4 --- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 1 + kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 
ce4458d2c..88725f343 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -160,4 +160,5 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index afcf6f647..74f30d44a 100644 --- a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -26,8 +26,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include +#include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n8 = n & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + for (j = 0; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i +=4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + 
boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; } From beccb83b167b50e3742aa113aab51e57d0e9baa2 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 19:46:08 +0800 Subject: [PATCH 102/143] sbgemm: cooperlake: add n24 kernel for tcopy_4 --- kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 101 +++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git 
a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index 74f30d44a..e9edd4571 100644 --- a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -29,6 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; @@ -36,13 +40,106 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset0 = b; + BLASLONG n24 = n - (n % 24); BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; - for (j = 0; j < n8; j += 8) { + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = 
_mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, 
_mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { boffset1 = boffset0 + m * 4; - for (i = 0; i < m4; i +=4) { + for (i = 0; i < m4; i += 4) { __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); From 682d66555d050dd31a48e5337815b5e1422d8f80 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 20 Aug 2021 22:01:00 +0800 Subject: [PATCH 103/143] sbgemm: cooperlake: implement ncopy_16 --- kernel/x86_64/sbgemm_ncopy_16_cooperlake.c | 320 +++++++++++++++++++++ 1 file changed, 320 insertions(+) diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index afcf6f647..95ed82d7c 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -26,8 +26,328 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include +#include #include "common.h" +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, 
nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; \ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; + aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + u_int64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + 
__m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + 
STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; 
+ load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + 
switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; } From 59a1114d03b59794ae46eb6ae60b9a3b4b842709 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 
Sep 2021 18:12:40 +0800 Subject: [PATCH 104/143] sbgemm: cooperlake: tuning for small matrix --- kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c index 823aafbdd..70becd9fa 100644 --- a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { - return 1; + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; } From 4289cf048dc1b5b735f65a3183f2c903c8f090bc Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 18:34:26 +0800 Subject: [PATCH 105/143] sbgemm: avoid falling into SGEMM_KERNEL_DIRECT --- interface/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemm.c b/interface/gemm.c index 6dcc54041..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif From 045ed5c91df1e4d330ff1a3e93a721f98552692b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 23:37:08 +0800 Subject: [PATCH 106/143] sbgemm: fix build error in BFLOAT16 disabled --- driver/others/parameter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/parameter.c 
b/driver/others/parameter.c index d7dbddc7c..791e5dc27 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,7 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -630,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; From b858e65476b0ece1ccd082c62dd23d5ff1cb44b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Sep 2021 10:51:59 +0200 Subject: [PATCH 107/143] migrate from deprecated ubuntu-16.04 vmImage --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b1bded639..5d4a1ecd3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ jobs: # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo 
"FROM quay.io/pypa/manylinux1_x86_64 @@ -35,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image From 7f4aa106f27d11cfa7e394238f222cca4f93d1bd Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 8 Sep 2021 07:04:13 -0500 Subject: [PATCH 108/143] Fixing syntax error in makefile Fixing syntax issue in Makefile.power added by recent commit af19cda65aef4d033ae33213013c88b0a99f9da2 --- Makefile.power | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 4e7478213..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,7 +12,7 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math -ifeq ($(F_COMPILER, IBM) +ifeq ($(F_COMPILER), IBM) FCOMMON_OPT += -O2 -qrecur -qnosave else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math From d17238599b573350b166973619039e67fba12fdd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 13:38:28 +0200 Subject: [PATCH 109/143] Add casts --- kernel/x86_64/dasum_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/sasum_microk_haswell-2.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += 
(__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - 
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From 20581bf303776f831c788ced24f179d720ec5c39 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:36:27 +0200 Subject: [PATCH 110/143] Remove unused variable --- interface/zsyr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 71d4dbf29..c70bd819e 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; From ef2471203068b64d648b1495c9399bc18e802788 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:37:44 +0200 Subject: [PATCH 111/143] Move a conditionally used variable --- kernel/generic/dot.c | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + (double) y[i+3] * (double) x[i+3] ; } #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] From 7d873a329f477c676b39719d4f83a87a506cc0b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:38:47 +0200 Subject: [PATCH 112/143] Add ifdefs around conditionally used functions --- kernel/x86_64/sgemv_n_4.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 90865c4b3..0d8cada75 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -246,6 +248,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif +#endif + static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 1085775bc68c7de6e4a93c0d920b5564c8e84706 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:05:55 +0200 
Subject: [PATCH 113/143] really remove the unused variable --- interface/zsyr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index c70bd819e..54fb8a4e9 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; From 0925dfe2c9a287f1fadfd20ea718e89b722c4de0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:30:19 +0200 Subject: [PATCH 114/143] One instance of kernel_4x1 is used even on SKX --- kernel/x86_64/sgemv_n_4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 0d8cada75..e0778006f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -172,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 @@ -248,8 +249,6 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif -#endif - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 5e4f1e3677df7ca74fd9d3dd264de8ca095f0553 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:09:46 +0200 Subject: [PATCH 115/143] Remove BFLOAT16 from the task list of GenerateNamedObject --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0330b2ce7..ef7457135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") From 
1c0a8a714a5b00b1773c8a91b9cd155007b10480 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:10:58 +0200 Subject: [PATCH 116/143] Add defaults for SBGEMV kernels --- cmake/kernel.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..09ca5eb57 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) set(SHSWAPKERNEL ../arm/swap.c) set(TOBF16KERNEL ../x86_64/tobf16.c) set(BF16TOKERNEL ../x86_64/bf16to.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () From e02df9fc55d96388951901420d6be9ff9e404228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:12:27 +0200 Subject: [PATCH 117/143] Propagate BUILD_BFLOAT16 to CFLAGS --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7d2672998..f56ded966 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -469,6 +469,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() From 5f6a6092537f156d14e11bd5cd6f6b15c3f861ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:13:57 +0200 Subject: [PATCH 118/143] Add sbgemv --- driver/level2/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 61367e596..3e9964ab1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) endif () +# special defines for complex if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach 
(u_source ${U_SOURCES}) @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (BUILD_BFLOAT16) + if (USE_THREAD) + GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") + endif () +endif () + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") From 2f8220d757e9db0d4b748232cbdb2582ff64f611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:14:43 +0200 Subject: [PATCH 119/143] Add sbgemm --- driver/level3/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 077862abc..75b25d039 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + endif () + endif () endforeach () if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) From c35739db5ee784ba5a210441b0f30962a2f36b01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:15:57 +0200 Subject: [PATCH 120/143] Add separate entries for BFLOAT16 functions and fix missing cblas_xerbla --- interface/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..ccb5fce3f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -82,6 +82,7 @@ foreach 
(CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -104,6 +105,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) From ddf106f769637cbfa09ee3c3dbe3bfe4cb04ef56 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:17:18 +0200 Subject: [PATCH 121/143] Add dedicated entries for BFLOAT16 kernels --- kernel/CMakeLists.txt | 105 ++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d8a230436..9ffbd944f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") 
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" 
false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" 
"${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false 
"BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) @@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" 
"${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () From ce036a2fc0a593a780a7ecd12933afd93e265e85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 21:41:53 +0200 Subject: [PATCH 122/143] Add casts --- kernel/x86_64/dasum_microk_skylakex-2.c | 8 ++++---- kernel/x86_64/sasum_microk_skylakex-2.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += 
(__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From dd09f0173e90f98ec382ef5ce1ddf4d1eb7c67e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 21:52:26 +0200 Subject: [PATCH 123/143] Remove extraneous qualifiers from struct definition --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 48067923e..0185fa683 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2695,7 +2695,7 @@ static volatile struct { } memory[NUM_BUFFERS]; -static volatile struct newmemstruct +struct newmemstruct { BLASULONG lock; void *addr; From b751edf6248e1897d1966d4693b2be980b89f518 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 13:36:07 -0500 Subject: [PATCH 124/143] Fix unused 
variable warnings on Power --- kernel/power/drot.c | 4 +--- kernel/power/idamax.c | 2 +- kernel/power/trsm_kernel_LN_power10.c | 1 - kernel/power/trsm_kernel_LT_power10.c | 1 - kernel/power/zgemv_n_4.c | 1 - kernel/power/zgemv_n_power10.c | 1 - 6 files changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3229878e4..30c7411cc 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -110,8 +110,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -139,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } #endif diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c index 5ca1603a6..246c3a236 100644 --- a/kernel/power/trsm_kernel_LN_power10.c +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[120] = (c0[15] *= a[255]); b[121] = (c1[15] *= a[255]); diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c index 14ff12fe4..51f3a4e61 100644 --- a/kernel/power/trsm_kernel_LT_power10.c +++ 
b/kernel/power/trsm_kernel_LT_power10.c @@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[0] = (c0[0] *= a[0]); b[1] = (c1[0] *= a[0]); diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c index f5bb8d70e..a545b00d8 100644 --- a/kernel/power/zgemv_n_power10.c +++ b/kernel/power/zgemv_n_power10.c @@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; From 99aa10b3ff8870f4718fc842ce80871247cb93af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:10:43 +0200 Subject: [PATCH 125/143] Initialize abs_mask1 with itself to silence a gcc warning actual initialization is via the _mm_cmpeq_epi8, which I've seen claimed to be the fastest way to set an xmm register to all 1s --- kernel/x86_64/casum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ 
b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); From 8dfa61a61c0b6d1f9a742e3dc2ae455bb3703cc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:11:35 +0200 Subject: [PATCH 126/143] Initialize abs_mask1 with itself to silence a gcc warning --- kernel/x86_64/zasum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); From 0e8b4adf22981f3bd8f80e7e1f9e58edec54a598 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 22:18:48 +0000 Subject: [PATCH 127/143] Remove unused commented code (#if directive) --- driver/others/dynamic_power.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d9c15b312..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif From 7d4a2215799772a4d81a3d3e3b8d7faa515c68b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:18:25 +0200 
Subject: [PATCH 128/143] Remove unused TEMP2 and reshuffle to leave x18 unused (reserved on OSX) --- kernel/arm64/dgemm_tcopy_8.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 From 0a4ac4b5850b5dee9f285637f06a4594f2e10dc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:19:51 +0200 Subject: [PATCH 129/143] Use x21 for I to leave x18 unused (reserved on OSX) --- kernel/arm64/sgemm_tcopy_16.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 46198b3a2..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. #define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 From 7d751774465637c25ef45d8c0f2a2361553e3df4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:24:11 +0200 Subject: [PATCH 130/143] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/dtrmm_kernel_8x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] From 380940271b7647cc82000b4f34d681a3259d222f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:28:19 +0200 Subject: [PATCH 131/143] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/strmm_kernel_16x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] From 590fbff06e818c3135a0b80cfae5a471da7f4e09 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:42:17 +0200 Subject: [PATCH 132/143] move alpha to x19/x20 to leave x18 unused for OSX --- kernel/arm64/zgemm_kernel_4x4.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] From 90cc944625ce0405145bdde03af0bf4e19e3f1ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:53:18 +0200 Subject: [PATCH 133/143] Move alphaI to x22 to leave x18 unused (reserved on OSX) --- kernel/arm64/ztrmm_kernel_4x4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 From 5c537a5de07909f66c64cd8128c4a44df6ac8ba4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Sep 2021 14:54:35 +0200 Subject: [PATCH 134/143] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 88a5a5035..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. 
@@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library From b7bb2e36b8b8197bf4ae794b0982dde0336e17bc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Sun, 26 Sep 2021 12:17:21 +0300 Subject: [PATCH 135/143] Makefile.system: adjust mipsel/mips64el ARCH variables When building for MIPS{64} little-endian variants, the included makefiles should be the same as for the big-endian. There are already some adjustments being done for some ARCH names. This change adds the ones for the `mipsel` and `mips64el` names, so that the Makefile.mips{64} files get included. 
This comes as a result of: https://github.com/openwrt/packages/issues/16649 Signed-off-by: Alexandru Ardelean --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20db80d07..150dbef50 100644 --- a/Makefile.system +++ b/Makefile.system @@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7) override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 +else ifeq ($(ARCH), mipsel) +override ARCH=mips +else ifeq ($(ARCH), mips64el) +override ARCH=mips64 else ifeq ($(ARCH), zarch) override ARCH=zarch endif From ee5ca8a328bae3da45a15452e9772c67165fabe0 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 28 Sep 2021 18:22:15 +0800 Subject: [PATCH 136/143] x86_64: BFLOAT16: fix build warning --- kernel/x86_64/bf16_common_macros.h | 36 ++++---- kernel/x86_64/sbdot_microk_cooperlake.c | 14 +-- .../x86_64/sbgemm_block_microk_cooperlake.c | 2 +- .../sbgemv_n_microk_cooperlake_template.c | 11 ++- .../sbgemv_t_microk_cooperlake_template.c | 91 +++++++++++++------ 5 files changed, 100 insertions(+), 54 deletions(-) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 78db7abb2..cdb4beff6 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda 
+ idx_n])); \ + regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = 
_mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, 
bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2c27221ac..b8c41f4f7 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat // K=Any number but will be processed based on 32, M<=16 void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * src_addr0; bfloat16 * dst_addr0, * dst_addr1; BLASLONG tag_k_32x = k & (~31); diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + 
matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = 
_mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, 
float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, 
bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); 
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = 
_mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef 
ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - 
matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); From 2d33e12a119f0cf97e5c41ff4f6499e9229d9bd5 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 30 Sep 2021 03:14:15 -0400 Subject: [PATCH 137/143] Make sure that Netlib LAPACK respects FFLAGS OpenBLAS allows users to specify `FFLAGS` and then uses `override` to append additional options. However, without such an override in lapack's make.inc, lapack would use the external FFLAGS, rather than the ones being computed by OpenBLAS. For example the `DEBUG=1` flag would not apply to LAPACK code. This is all a bit messy but forced by the integration with netlib lapack. Note that `CFLAGS` already has this override for the same reason. It is possible that other variables here should have a similar override, but I think for most of the other ones, OpenBLAS's build system does not append to the flags passed in by the user. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 555d1c467..49fd57ff2 100644 --- a/Makefile +++ b/Makefile @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc From 2be5ee3cca97a597f2ee2118808a2d5eacea050c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:17:21 +0200 Subject: [PATCH 138/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/clarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. 
(M.LE.0) ) THEN RETURN END IF * From fe497efa0510466fd93578aaf9da1ad8ed4edbe7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:18:20 +0200 Subject: [PATCH 139/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/dlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From ddb0ff5353637bb5f5ad060c9620e334c143e3d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:07 +0200 Subject: [PATCH 140/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/slarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From 337b65133df174796794871b3988cd03426e6d41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:53 +0200 Subject: [PATCH 141/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/zlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From ad87d627487a2647ee782b3948ceeba8733bee68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 16:27:34 +0200 Subject: [PATCH 142/143] Update Alpine version --- azure-pipelines.yml | 5 
+++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d4a1ecd3..f9e79018b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -206,8 +206,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ + && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 5a468ae87a44f4eee356d629d0826bed0a5a5f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 19:25:58 +0200 Subject: [PATCH 143/143] Update Changelog for 0.3.18 (#3388) * Update Changelog for 0.3.18 --- Changelog.txt | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index ee0484e2b..59fe1d45e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,47 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - 
improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + ==================================================================== Version 0.3.17 15-Jul-2021