From fe0e66564ecab9627ba9313ab7c116b586b7cf19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:39:20 +0200 Subject: [PATCH 001/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/cchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index de4aed696..ab54078a3 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From 2b9443b7e78aa4b5f77e5d4d4cb03205bcdd52fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:40:29 +0200 Subject: [PATCH 002/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/dchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 00e8eb57f..6399fecef 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
From f4d4abd423ecf998faa70e09847fd99cdac8888a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:41:45 +0200 Subject: [PATCH 003/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/schkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index c3f9ca162..5484a7c26 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From f176ff90af6b1d16f940575ea2f03edc13e5f444 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:42:43 +0200 Subject: [PATCH 004/143] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/zchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 908b7d651..7e9144d15 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
From 6e3fbe8ac5a405149ebd6acaad6a4c88d3e07215 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:59:15 +0200 Subject: [PATCH 005/143] Update version to 0.3.17.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37191a42b..0330b2ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 17) +set(OpenBLAS_PATCH_VERSION 17.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 38d5b4b1241f60ab533f136b2d8e61eef1f5062e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 15:00:01 +0200 Subject: [PATCH 006/143] Update version to 0.3.17.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 2e0980fa9..7c04a3101 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.17 +VERSION = 0.3.17.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 49bbf330ca592f439a07f24f137e61af1cc9c616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jul 2021 22:19:19 +0200 Subject: [PATCH 007/143] Empirical workaround for numpy SVD NaN problem from issue 3318 --- kernel/Makefile.L2 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 888a9b959..ac53c29c3 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,10 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + ### GEMV ### ifndef SGEMVNKERNEL @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) - $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) From 30f23be0f94c7041b7e3bb53a4a0236355cdabad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 12:00:03 +0200 Subject: [PATCH 008/143] Rework setting of -mfma to only apply it where necessary --- cmake/cc.cmake | 6 +++--- cmake/system.cmake | 10 +++++----- cmake/utils.cmake | 10 +++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 76952152b..ac5e455d5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () - if (HAVE_FMA3) - set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") - endif () + # if (HAVE_FMA3) + #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + #endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index 
34874827c..f8bd6678e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -186,11 +186,11 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_FMA3) - if (NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") - endif() - endif() + # if (DEFINED HAVE_FMA3) + # if (NOT NO_AVX2) + # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + # endif() + # endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 794d73d06..2c1a1c763 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in) configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) - + message (STATUS ${new_source_file}) + if (DEFINED HAVE_FMA3) + if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + if ( ${new_source_file} MATCHES "dgemv_t_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + endif () endforeach () endforeach () From 47ba85f314808476c8254779389607f9af60231f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 17:24:15 +0200 Subject: [PATCH 009/143] Fix regex to match kernels suffixed with cpuname too --- cmake/utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 2c1a1c763..6b54092ea 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -313,10 +313,10 @@ function(GenerateNamedObjects sources_in) list(APPEND SRC_LIST_OUT ${new_source_file}) message (STATUS ${new_source_file}) if (DEFINED HAVE_FMA3) - if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") 
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () - if ( ${new_source_file} MATCHES "dgemv_t_k.c") + if ( ${new_source_file} MATCHES "dgemv_t_k.*c") set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () endif () From efbd7c7840f01f6479fb0224ff473c3166eee669 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 Jul 2021 13:42:52 +0200 Subject: [PATCH 010/143] GCC did not support -mtune for ARM64 before 5.1 --- Makefile.arm64 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index c23a0876e..2656a17f9 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,4 +1,15 @@ ifneq ($(C_COMPILER), PGI) + +ifneq ($(GCCVERSIONGT4), 1) +CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a +endif + + +else + + ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a ifneq ($(F_COMPILER), NAG) @@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag endif endif endif + endif + +endif \ No newline at end of file From af0a69f355a086d70cc08ccda8bde7a48b3133c4 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Jul 2021 15:44:54 +0800 Subject: [PATCH 011/143] Add support for LOONGARCH64 --- Makefile.loongarch64 | 3 + Makefile.system | 12 + TargetList.txt | 2 + c_check | 53 +- common.h | 6 +- common_loongarch64.h | 199 ++ common_macro.h | 3 +- cpuid_loongarch64.c | 110 + ctest.c | 4 + getarch.c | 24 +- kernel/loongarch64/KERNEL | 236 ++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/KERNEL.generic | 167 ++ kernel/loongarch64/Makefile | 1 + kernel/loongarch64/amax.S | 230 ++ kernel/loongarch64/amin.S | 186 ++ kernel/loongarch64/asum.S | 232 ++ kernel/loongarch64/cnrm2.S | 159 ++ kernel/loongarch64/copy.S | 225 ++ kernel/loongarch64/dnrm2.S | 314 +++ kernel/loongarch64/dot.S | 391 ++++ kernel/loongarch64/gemm_kernel.S | 1859 ++++++++++++++++ kernel/loongarch64/gemv_n.S | 531 +++++ kernel/loongarch64/gemv_t.S 
| 436 ++++ kernel/loongarch64/iamax.S | 233 ++ kernel/loongarch64/iamin.S | 233 ++ kernel/loongarch64/izamax.S | 217 ++ kernel/loongarch64/izamin.S | 217 ++ kernel/loongarch64/max.S | 174 ++ kernel/loongarch64/min.S | 174 ++ kernel/loongarch64/scal.S | 330 +++ kernel/loongarch64/snrm2.S | 249 +++ kernel/loongarch64/swap.S | 330 +++ kernel/loongarch64/trsm_kernel_LN.S | 2863 +++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_LT.S | 2854 ++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_RT.S | 2850 ++++++++++++++++++++++++ kernel/loongarch64/zamax.S | 190 ++ kernel/loongarch64/zamin.S | 198 ++ kernel/loongarch64/zasum.S | 158 ++ kernel/loongarch64/zcopy.S | 217 ++ kernel/loongarch64/zdot.S | 330 +++ kernel/loongarch64/zgemm3m_kernel.S | 1359 ++++++++++++ kernel/loongarch64/zgemm_kernel.S | 1047 +++++++++ kernel/loongarch64/zgemv_n.S | 648 ++++++ kernel/loongarch64/zgemv_t.S | 556 +++++ kernel/loongarch64/znrm2.S | 304 +++ kernel/loongarch64/zscal.S | 356 +++ kernel/loongarch64/ztrsm_kernel_LT.S | 1344 ++++++++++++ kernel/loongarch64/ztrsm_kernel_RT.S | 1343 ++++++++++++ lapack/laswp/loongarch64/Makefile | 12 + param.h | 46 + 51 files changed, 24189 insertions(+), 27 deletions(-) create mode 100644 Makefile.loongarch64 create mode 100644 common_loongarch64.h create mode 100644 cpuid_loongarch64.c create mode 100644 kernel/loongarch64/KERNEL create mode 100644 kernel/loongarch64/KERNEL.LOONGSON3R5 create mode 100644 kernel/loongarch64/KERNEL.generic create mode 100644 kernel/loongarch64/Makefile create mode 100644 kernel/loongarch64/amax.S create mode 100644 kernel/loongarch64/amin.S create mode 100644 kernel/loongarch64/asum.S create mode 100644 kernel/loongarch64/cnrm2.S create mode 100644 kernel/loongarch64/copy.S create mode 100644 kernel/loongarch64/dnrm2.S create mode 100644 kernel/loongarch64/dot.S create mode 100644 kernel/loongarch64/gemm_kernel.S create mode 100644 kernel/loongarch64/gemv_n.S create mode 100644 kernel/loongarch64/gemv_t.S create 
mode 100644 kernel/loongarch64/iamax.S create mode 100644 kernel/loongarch64/iamin.S create mode 100644 kernel/loongarch64/izamax.S create mode 100644 kernel/loongarch64/izamin.S create mode 100644 kernel/loongarch64/max.S create mode 100644 kernel/loongarch64/min.S create mode 100644 kernel/loongarch64/scal.S create mode 100644 kernel/loongarch64/snrm2.S create mode 100644 kernel/loongarch64/swap.S create mode 100644 kernel/loongarch64/trsm_kernel_LN.S create mode 100644 kernel/loongarch64/trsm_kernel_LT.S create mode 100644 kernel/loongarch64/trsm_kernel_RT.S create mode 100644 kernel/loongarch64/zamax.S create mode 100644 kernel/loongarch64/zamin.S create mode 100644 kernel/loongarch64/zasum.S create mode 100644 kernel/loongarch64/zcopy.S create mode 100644 kernel/loongarch64/zdot.S create mode 100644 kernel/loongarch64/zgemm3m_kernel.S create mode 100644 kernel/loongarch64/zgemm_kernel.S create mode 100644 kernel/loongarch64/zgemv_n.S create mode 100644 kernel/loongarch64/zgemv_t.S create mode 100644 kernel/loongarch64/znrm2.S create mode 100644 kernel/loongarch64/zscal.S create mode 100644 kernel/loongarch64/ztrsm_kernel_LT.S create mode 100644 kernel/loongarch64/ztrsm_kernel_RT.S create mode 100644 lapack/laswp/loongarch64/Makefile diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.system b/Makefile.system index bb8c60e91..4084390db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -780,6 +780,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSONG3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef 
BINARY_DEFINED diff --git a/TargetList.txt b/TargetList.txt index f93a629d8..963545cdd 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -110,3 +110,5 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 diff --git a/c_check b/c_check index e24943a29..030f5e632 100644 --- a/c_check +++ b/c_check @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if 
($binary eq "64"); @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index ac795937c..ff5254a5c 100644 --- a/common.h +++ b/common.h @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..959e7e58a --- /dev/null +++ 
b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if 
defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 1) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..0136f18ab 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2490,7 +2490,8 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} diff --git a/ctest.c b/ctest.c index d674a8cbd..4f18918f5 100644 --- a/ctest.c +++ b/ctest.c @@ -157,6 +157,10 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif diff --git a/getarch.c b/getarch.c index 
3bc8a0c3d..6e43616f7 100644 --- a/getarch.c +++ b/getarch.c @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" +#else +#endif + #ifdef FORCE_I6400 #define FORCE #define ARCHITECTURE "MIPS" @@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" #define OPENBLAS_SUPPORTED @@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL new file mode 100644 index 000000000..e96a90e72 --- /dev/null +++ b/kernel/loongarch64/KERNEL @@ -0,0 +1,236 @@ +ifndef SAXPYKERNEL +SAXPYKERNEL = ../arm/axpy.c +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = ../arm/axpy.c +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef SROTKERNEL +SROTKERNEL = ../arm/rot.c +endif + +ifndef DROTKERNEL +DROTKERNEL = ../arm/rot.c +endif + +ifndef CROTKERNEL +CROTKERNEL = ../arm/zrot.c +endif + +ifndef ZROTKERNEL +ZROTKERNEL = ../arm/zrot.c +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = ../arm/zswap.c +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = ../arm/zswap.c +endif + +ifndef SSUMKERNEL +SSUMKERNEL = ../arm/sum.c +endif + +ifndef DSUMKERNEL 
+DSUMKERNEL = ../arm/sum.c +endif + +ifndef CSUMKERNEL +CSUMKERNEL = ../arm/zsum.c +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = ../arm/zsum.c +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = ../arm/imax.c +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = ../arm/imax.c +endif + +ifndef ISMINKERNEL +ISMINKERNEL = ../arm/imin.c +endif + +ifndef IDMINKERNEL +IDMINKERNEL = ../arm/imin.c +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif + +ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif + +ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif + +ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = 
../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef CGEMM3MKERNEL +CGEMM3MKERNEL = zgemm3m_kernel.S +endif + +ifndef ZGEMM3MKERNEL +ZGEMM3MKERNEL = zgemm3m_kernel.S +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 new file mode 100644 index 000000000..cce4093e3 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -0,0 +1 @@ +#TODO: Add loongarch64 SIMD 
optimizations diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic new file mode 100644 index 000000000..105b2f6fd --- /dev/null +++ b/kernel/loongarch64/KERNEL.generic @@ -0,0 +1,167 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c 
+ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/loongarch64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S new file mode 100644 index 000000000..4b135c522 --- /dev/null +++ b/kernel/loongarch64/amax.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r17 +#define TEMP $r18 + +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + + LD a1, X, 0 * SIZE + addi.d N, N, -1 + + add.d X, X, INCX + FABS s1, a1 + + FABS s2, a1 + bge $r0, N, .L999 + + FABS s3, a1 + srai.d I, N, 3 + + FABS s4, a1 + bge $r0, I, .L15 + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD 
a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, s1, t1, $fcc0 + + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move 
$r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S new file mode 100644 index 000000000..ff9978f26 --- /dev/null +++ b/kernel/loongarch64/amin.S @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT 
s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 +.L15: + andi I, N, 7 +NOP + bge $r0, I, .L998 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S new file mode 100644 index 000000000..e4c717085 --- /dev/null +++ b/kernel/loongarch64/asum.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + FABS t1, a1 + LD a6, X, 5 * SIZE + FABS t2, a2 + LD a7, X, 6 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + ADD s1, s1, t1 + LD a1, X, 8 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD 
a2, X, 9 * SIZE + FABS t2, a6 + NOP + ADD s1, s1, t3 + LD a3, X, 10 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 11 * SIZE + FABS t4, a8 + addi.d X, X, 8 * SIZE + ADD s1, s1, t1 + LD a5, X, 4 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 5 * SIZE + FABS t2, a2 + NOP + ADD s1, s1, t3 + LD a7, X, 6 * SIZE + FABS t3, a3 + NOP + ADD s2, s2, t4 + LD a8, X, 7 * SIZE + FABS t4, a4 + blt $r0, I, .L12 + .align 3 +.L13: + ADD s1, s1, t1 + addi.d X, X, 8 * SIZE + FABS t1, a5 + NOP + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + ADD s1, s1, t1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + LD a7, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + add.d X, X, INCX + ADD s2, s2, t2 + LD a2, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + add.d X, X, INCX + ADD s2, s2, t4 + LD a4, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + add.d X, X, INCX + ADD s2, s2, t2 + LD a6, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + add.d X, X, INCX + ADD s2, s2, t4 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 
+ ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + add.d X, X, INCX + ADD s1, s1, t1 + blt $r0, I, .L26 + .align 3 +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S new file mode 100644 index 000000000..c4b2555d3 --- /dev/null +++ b/kernel/loongarch64/cnrm2.S @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, 2 * SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 2 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + fcvt.d.s t1, a1 + LD a7, X, 0 * SIZE + fcvt.d.s t2, a2 + LD a8, X, 1 * SIZE + fcvt.d.s t3, a3 + addi.d I, I, -1 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + LD a2, X, 1 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + LD a4, X, 1 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + addi.d I, I, -1 + fmadd.d s2, t2, t2, s2 + LD a6, 
X, 1 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 1 * SIZE + fmadd.d s2, t4, t4, s2 + add.d X, X, INCX + fcvt.d.s t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fmadd.d s1, t1, t1, s1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S new file mode 100644 index 000000000..28b7bce4c --- /dev/null +++ b/kernel/loongarch64/copy.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE 
+ addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + 
andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, 
X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, 
XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..4fcd569c8 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 +1,391 @@ +/*************************************************************************** +Copyright 
(c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 
* SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + 
.align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, 
$r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV 
c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + 
MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 
+ LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD 
c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, 
I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, 
c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, 
a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * 
SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f10, CO5, 0 * SIZE + MADD c21, c21, ALPHA, $f8 + LD $f11, CO6, 0 * SIZE + MADD c31, c31, ALPHA, $f23 + LD $f12, CO7, 0 * SIZE + MADD c41, c41, ALPHA, $f9 + LD $f13, CO8, 0 * SIZE + MADD c51, c51, ALPHA, $f10 + ST c11, CO1, 0 * SIZE + MADD c61, c61, ALPHA, $f11 + ST c21, CO2, 0 * SIZE + MADD c71, c71, ALPHA, $f12 + ST c31, CO3, 0 * SIZE + MADD c81, c81, ALPHA, $f13 + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + MUL c51, ALPHA, c51 + ST c21, CO2, 0 * SIZE + MUL c61, ALPHA, c61 + ST c31, CO3, 0 * SIZE + MUL c71, ALPHA, c71 + ST c41, CO4, 0 * SIZE + MUL c81, ALPHA, c81 + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 8 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + 
srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f11, CO3, -1 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f12, CO4, -2 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f13, CO4, -1 * SIZE + MADD c22, c22, ALPHA, $f9 + MADD c31, c31, ALPHA, $f10 + ST c11, CO1, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + ST c12, CO1, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + ST c21, CO2, -2 * SIZE + MADD c42, c42, ALPHA, $f13 + ST c22, CO2, -1 * SIZE + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + addi.d CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + addi.d CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + addi.d 
CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO2,CO2, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +#endif +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, 
b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + MADD c31, c31, ALPHA, $f23 + MADD c41, c41, ALPHA, $f9 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L 
+ add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L55 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 
12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + MADD c21, c21, ALPHA, $f23 + MADD c22, c22, ALPHA, $f9 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE + blt $r0, I, .L51 +#else + addi.d I, I, -1 + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L51 +#endif + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, 
AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L65 +#else + srai.d L, K, 2 + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + ST c11, CO1, 0 * 
SIZE + ST c21, CO2, 0 * SIZE +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L75 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 
2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + ADD c11, c11, c21 + ADD c12, c12, c22 + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + blt $r0, I, .L71 +#else + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L71 +#endif + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, 
BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + ADD c11, c11, c21 + MADD c11, c11, ALPHA, $f22 + ST c11, CO1, 0 * SIZE +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + ST c11, CO1, 0 * SIZE +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 96 + fld.d $f24, $sp, 56 + fld.d $f25, $sp, 64 + fld.d $f26, $sp, 72 + fld.d $f27, $sp, 80 + fld.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + LDARG 
$r20, $sp, 104 + LDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 120 + fld.d $f19, $sp, 128 + fld.d $f20, $sp, 136 + fld.d $f21, $sp, 144 +#endif + addi.d $sp, $sp, 160 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S new file mode 100644 index 000000000..334a2991f --- /dev/null +++ b/kernel/loongarch64/gemv_n.S @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define y1 $f16 +#define y2 $f17 +#define y3 $f3 +#define y4 $f1 +#define y5 $f2 +#define y6 $f4 +#define y7 $f5 +#define y8 $f6 +#define t1 $f7 +#define t2 $f18 +#define t3 $f19 +#define t4 $f20 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -48 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 + fst.d $f20, $sp, 32 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + LD a2, XX, 0 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * 
SIZE + add.d XX, XX, INCY + LD a4, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + add.d X, X, INCX + LD x2, X, 0 * SIZE + add.d X, X, INCX + move AO1, A + add.d AO2, A, LDA + add.d A, AO2, LDA + move YY, YORIG + MUL x1, ALPHA, x1 + srai.d I, M, 3 + MUL x2, ALPHA, x2 + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD y5, YY, 4 * SIZE + LD a6, AO2, 1 * SIZE + LD y6, YY, 5 * SIZE + LD a7, AO2, 2 * SIZE + LD y7, YY, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 8 * SIZE + 
MADD t2, a6, x2, t2 + LD a6, AO2, 9 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 10 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 11 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + MADD t1, a5, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD t2, a6, x2, t2 + addi.d AO2, AO2, 8 * SIZE + MADD t3, a7, x2, t3 + addi.d YY, YY, 8 * SIZE + MADD t4, a8, x2, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 1 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO2, 2 * SIZE + MADD y3, a3, x1, y3 + LD a8, AO2, 3 * SIZE + MADD y4, a4, x1, y4 + MADD y1, a5, x2, y1 + addi.d YY, YY, 4 * SIZE + MADD y2, a6, x2, y2 + addi.d AO1, AO1, 4 * SIZE + MADD y3, a7, x2, y3 + addi.d AO2, AO2, 4 * SIZE + MADD y4, a8, x2, y4 + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 2 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a5, AO2, 0 * 
SIZE + LD a6, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + addi.d YY, YY, 2 * SIZE + MADD y1, a5, x2, y1 + addi.d AO1, AO1, 2 * SIZE + MADD y2, a6, x2, y2 + addi.d AO2, AO2, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L17: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + MADD y1, a5, x2, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + .align 3 + +.L21: + LD x1, X, 0 * SIZE + add.d X, X, INCX + move YY, YORIG + move AO1, A + srai.d I, M, 3 + MUL x1, ALPHA, x1 + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD y5, YY, 4 * SIZE + LD y6, YY, 5 * SIZE + LD y7, YY, 6 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, 
y4 + LD a4, AO1, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + MADD y3, a3, x1, y3 + addi.d YY, YY, 4 * SIZE + MADD y4, a4, x1, y4 + addi.d AO1, AO1, 4 * SIZE + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + MADD y1, a1, x1, y1 + addi.d YY, YY, 2 * SIZE + MADD y2, a2, x1, y2 + addi.d AO1, AO1, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L27: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + MADD y1, a1, x1, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L900: + li YORIG, SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + addi.d XX, XX, 4 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + addi.d XX, XX, 1 * SIZE + ST a1, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 
16 + fld.d $f19, $sp, 24 + fld.d $f20, $sp, 32 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 48 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S new file mode 100644 index 000000000..19333ed4a --- /dev/null +++ b/kernel/loongarch64/gemv_t.S @@ -0,0 +1,436 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f1 +#define x3 $f2 +#define x4 $f4 +#define x5 $f5 +#define x6 $f6 +#define x7 $f7 +#define x8 $f18 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li I, SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d 
I, M, 3 + move XX, XORIG + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a3, AO1, 1 * SIZE + LD x3, XX, 2 * SIZE + LD a4, AO2, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a5, AO1, 2 * SIZE + LD x5, XX, 4 * SIZE + LD a6, AO2, 2 * SIZE + LD x6, XX, 5 * SIZE + LD a7, AO1, 3 * SIZE + LD x7, XX, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y2, a2, x5, y2 + LD a2, AO2, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + MADD y4, a4, x6, y4 + LD a4, AO2, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y2, a6, x7, y2 + LD a6, AO2, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + MADD y4, a8, x8, y4 + LD a8, AO2, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y2, a2, x5, y2 + MADD y3, a3, x6, y3 + MADD y4, a4, x6, y4 + MADD y1, a5, x7, y1 + addi.d XX, XX, 8 * SIZE 
+ MADD y2, a6, x7, y2 + addi.d AO1, AO1, 8 * SIZE + MADD y3, a7, x8, y3 + addi.d AO2, AO2, 8 * SIZE + MADD y4, a8, x8, y4 + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 2 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y4, a4, x2, y4 + LD a8, AO2, 3 * SIZE + MADD y1, a5, x3, y1 + MADD y2, a6, x3, y2 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + MADD y4, a8, x4, y4 + addi.d AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + ADD y2, y2, y4 + bge $r0, I, .L19 + .align 3 +.L18: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO2, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + addi.d AO2, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + blt $r0, I, .L18 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + addi.d J, J, -1 + MADD a2, y2, ALPHA, a2 + MTC y1, $r0 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + move AO1, A + bge $r0, J, .L999 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + LD a7, AO1, 3 * SIZE + LD x4, XX, 3 * SIZE + LD x5, XX, 4 * SIZE + LD x6, XX, 5 * SIZE + LD x7, XX, 6 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, 
AO1, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y3, a3, x6, y3 + MADD y1, a5, x7, y1 + MADD y3, a7, x8, y3 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y1, a5, x3, y1 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + bge $r0, I, .L29 + .align 3 +.L28: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + MADD y1, a1, x1, y1 + blt $r0, I, .L28 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S new file mode 100644 index 000000000..0f9e1bc59 --- /dev/null +++ b/kernel/loongarch64/iamax.S @@ -0,0 +1,233 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + 
CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S new file mode 100644 index 000000000..7751a9d03 --- /dev/null +++ b/kernel/loongarch64/iamin.S @@ -0,0 +1,233 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + 
CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S new file mode 100644 index 000000000..6d7cb9e30 --- /dev/null +++ b/kernel/loongarch64/izamax.S @@ -0,0 +1,217 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + 
CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S new file mode 100644 index 000000000..998927985 --- /dev/null +++ b/kernel/loongarch64/izamin.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + 
CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, 
s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT 
$fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..7399e57b3 --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, 
X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, 
XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..14b62cfe7 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 
+ +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S 
b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..c9d8f7fc1 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 
* SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + 
ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, 
B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD 
c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE 
+ LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, 
c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 
* SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, 
b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE 
+ MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 
* SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, 
AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, 
BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB 
c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, 
c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, 
CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE 
+ LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if 
defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, 
AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, 
b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + 
SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, 
c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + 
move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, 
-1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge 
$r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD 
c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + 
addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, 
a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, 
.L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + 
NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S new file mode 100644 index 000000000..aa6822c32 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT.S @@ -0,0 +1,2854 @@ 
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + 
sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, 
b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD 
c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD 
b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, 
CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE 
+ NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + 
LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST 
c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, 
B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, 
b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + 
MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + 
MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE 
+#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, 
KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt 
$r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, 
c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST 
c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD 
c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + 
NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 
+#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD 
a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, 
b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD 
c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE 
+#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * 
SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK 
+ slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, 
B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S new file mode 100644 index 000000000..c86d9c1e5 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT.S @@ -0,0 +1,2850 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + 
sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE 
+addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + MOV c21, c11 + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, 
BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MOV c21, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 
0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE 
+ MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + 
LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * 
SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL 
c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, 
KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + 
MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, 
c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST 
c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, 
b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB 
c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, 
CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD 
c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD 
a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, 
a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, 
BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + 
NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD 
b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB 
c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * 
SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + 
BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD 
b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE 
+ LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD 
b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, 
CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S new file mode 100644 index 000000000..f998bdc23 --- /dev/null +++ b/kernel/loongarch64/zamax.S @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, 
a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S new file mode 100644 index 000000000..bde9aebf8 --- /dev/null +++ b/kernel/loongarch64/zamin.S @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + NOP + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + 
.align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + NOP + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + NOP + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + NOP + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + NOP + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + + EPILOGUE diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S new file mode 100644 index 000000000..d1a1a732c --- /dev/null +++ b/kernel/loongarch64/zasum.S @@ -0,0 +1,158 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bge $r0, N, .L999 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 1 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 1 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + LD a8, X, 1 * SIZE + ADD s2, s2, t4 + add.d X, X, INCX + FABS t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + addi.d I, I, -1 + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t1 + ADD s2, s2, t2 + blt $r0, I, .L26 + .align 3 
+ +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S new file mode 100644 index 000000000..3fbe56074 --- /dev/null +++ b/kernel/loongarch64/zcopy.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, 2 * SIZE + NOP + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 2 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + ST a1, Y, -2 * SIZE + addi.d I, I, -1 + ST a2, Y, -1 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + .align 3 + +.L20: + srai.d I, N, 2 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, 
X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + LD a1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + LD a3, X, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + LD a5, X, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + LD a6, X, 1 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + LD a7, X, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S new file mode 100644 index 000000000..087c3845f --- /dev/null +++ b/kernel/loongarch64/zdot.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, 
a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * 
SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE 
+ MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, 
a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + 
MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: 
+ LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * 
SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD 
c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, 
$f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 
* SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 
* SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD 
b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + 
LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD 
b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + 
LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 
+.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD 
+#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 
1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + 
MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, 
b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 
c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + 
MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 
+#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, 
c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + 
add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 
3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..0cc49c789 --- /dev/null +++ 
b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD 
a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, 
t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 
t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + 
LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * 
SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST 
a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..85a9a0c0d --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, 
$sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * 
SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 
+ MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + 
MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, 
x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 
+ LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 
1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..fe53ed713 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 
2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD 
a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 
1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 
+ MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, 
$sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, 
c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + 
LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, 
a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * 
SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + 
LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE 
+ LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, 
$r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, 
b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + 
add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, 
c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d 
CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD 
+#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d 
BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if 
defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d 
TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 
+#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 
c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD 
a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * 
SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + 
MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, 
a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + 
SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 
c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 
* SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile diff --git a/param.h b/param.h index 965b97466..634e0ef5d 100644 --- a/param.h +++ b/param.h @@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +#if defined (LOONGSON3R5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 4d7dfe4845078dbe57afed8bb4181451d8cd3734 Mon Sep 17 00:00:00 2001 From: Craig Watson Date: Tue, 27 Jul 2021 09:00:30 +0000 Subject: [PATCH 012/143] Include Haiku in processor count checks --- driver/others/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 
6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6e654ccf2..39ed264e8 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1979,7 +1979,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int 
blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1987,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2011,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 02d4a49761f2ed74e0fe6943c3a3759ebed45ea3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 15 Jul 2021 04:54:33 -0500 Subject: [PATCH 013/143] Also make sure the `1` is INTEGER*4 for OMP_SET_NUM_THREADS --- lapack-netlib/TESTING/EIG/cchkee.F | 8 +++++--- lapack-netlib/TESTING/EIG/dchkee.F | 5 +++-- lapack-netlib/TESTING/EIG/schkee.F | 5 +++-- lapack-netlib/TESTING/EIG/zchkee.F | 8 +++++--- 4 
files changed, 16 insertions(+), 10 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index ab54078a3..ef9f71ec9 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CST', NOUT ) #if defined(_OPENMP) @@ -2340,7 +2341,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CHB', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 6399fecef..89b6958fe 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
@@ -1878,7 +1878,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL DERRST( 'DST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index 5484a7c26..b58433959 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1879,7 +1879,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL SERRST( 'SST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 7e9144d15..fb418a43b 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
@@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZST', NOUT ) #if defined(_OPENMP) @@ -2338,7 +2339,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZHB', NOUT ) #if defined(_OPENMP) From 34207bdf5b91373c08fbebf038b43e5b8c9ed7cf Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jul 2021 18:11:12 +0800 Subject: [PATCH 014/143] Fixed typos about LOONGARCH64 --- Makefile.system | 2 +- common_loongarch64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 4084390db..13c946ba1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -856,7 +856,7 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), loongarch64) -ifeq ($(CORE), LOONGSONG3R5) +ifeq ($(CORE), LOONGSON3R5) CCOMMON_OPT += -march=loongarch64 -mabi=lp64 FCOMMON_OPT += -march=loongarch64 -mabi=lp64 endif diff --git a/common_loongarch64.h b/common_loongarch64.h index 959e7e58a..e15539b5f 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -186,7 +186,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 20) -#define PAGESIZE (16UL << 1) +#define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #define HUGE_PAGESIZE ( 2 << 20) From cbc41973fde6137bc42c34de64a41b5a82b597c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:20:12 +0200 Subject: [PATCH 015/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 17f29fe69..f785d3f90 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} 
-DADD${BU} -DCBLAS") +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 From b4f4ed378b2343b0af8b1235838feef4f6c8c51c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:21:08 +0200 Subject: [PATCH 016/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d338242ff..e4ee8b28b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() + if (BUILD_SINGLE) list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) From e78fbe46541dedcf39eb0362e69b1de6f7808642 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:44:54 +0200 Subject: [PATCH 017/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 15c83a907..c5e1094da 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,9 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif override TARGET_ARCH= override TARGET_MACH= From 5dc6aa74f05cc6c4405be195461fa5afc2c03888 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:46:19 +0200 Subject: [PATCH 018/143] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 6c5f041c2..923f1537c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,8 @@ TOPDIR = .. include ../Makefile.system - +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif ifeq ($(NOFORTRAN),1) all :: From f2a7a67f5afa31e1e8839e5a386773e45bb5a687 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Jul 2021 17:23:40 +0200 Subject: [PATCH 019/143] Improve the "tried to allocate too many buffers" error message --- driver/others/memory.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 39ed264e8..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - -#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; + +#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || 
defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((*func != NULL) && (map_address == (void *) -1)) { + while ((func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,12 +1619,10 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif - +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #ifdef _WIN64 -static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1633,12 +1631,10 @@ static void (APIENTRY 
*dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif - +static int(*p_process_term)(void) = on_process_term; #ifdef _WIN64 -static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else -static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1672,23 +1668,16 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#elif !defined(OS_EMBEDDED) -#define ALLOC_MMAP -#define ALLOC_MALLOC #else +#define ALLOC_MMAP #define ALLOC_MALLOC - -inline int puts(const char *str) { return 0; } -inline int printf(const char *format, ...) { return 0; } -inline char *getenv(const char *name) { return ""; } -inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1702,6 +1691,7 @@ inline int atoi(const char *str) { return 0; } #include #include #include +#include #include #include #include @@ -1979,7 +1969,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1977,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || 
defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -2011,7 +2001,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 0b8f7c8c10957aa1d7836cb8ae55337d180d5a75 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 2 Aug 2021 10:00:41 +0800 Subject: [PATCH 020/143] Add cmake support for LOONGARCH64 --- cmake/arch.cmake | 4 ++++ cmake/cc.cmake | 9 +++++++++ cmake/fc.cmake | 7 +++++++ cmake/system_check.cmake | 4 +++- kernel/loongarch64/KERNEL | 2 ++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 154e59db6..57ee5a4fb 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -113,6 +113,10 @@ if (MIPS64) set(NO_BINARY_MODE 1) endif () +if (LOONGARCH64) + set(NO_BINARY_MODE 1) +endif () + if (${ARCH} STREQUAL "alpha") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index ac5e455d5..1794b5e5b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () + if (LOONGARCH64) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () + set(BINARY_DEFINED 1) + 
endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index fc1f9bb22..631664569 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () + if (LOONGARCH64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..8d0558c0e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") set(PPC 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") + set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -95,7 +97,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64) + if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL index e96a90e72..1c11df9b6 100644 --- a/kernel/loongarch64/KERNEL +++ b/kernel/loongarch64/KERNEL @@ -234,3 +234,5 @@ endif ifndef ZGEMM3MKERNEL ZGEMM3MKERNEL = zgemm3m_kernel.S endif + +DSDOTKERNEL = dot.S From 0a2077901cf94877f6173f6b580762b68b2fd2e0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 28 Apr 2020 19:01:36 +0800 Subject: [PATCH 021/143] Add small marix optimization kernel interface. 
make SMALL_MATRIX_OPT=1 --- Makefile.system | 5 ++ common_d.h | 6 ++ common_level3.h | 12 ++++ common_macro.h | 16 +++++ common_s.h | 5 ++ interface/gemm.c | 28 +++++++- kernel/Makefile.L3 | 73 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_nt.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tt.c | 49 +++++++++++++ 11 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tt.c diff --git a/Makefile.system b/Makefile.system index 13c946ba1..20d8d2f2a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -244,6 +244,11 @@ else ONLY_CBLAS = 0 endif +#For small matrix optimization +ifeq ($(SMALL_MATRIX_OPT), 1) +CCOMMON_OPT += -DSMALL_MATRIX_OPT +endif + # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/common_d.h b/common_d.h index 94dc3eea8..dad304a5f 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,12 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k + +#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn +#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt +#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn +#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index c4f9435a9..751592b67 100644 --- a/common_level3.h +++ b/common_level3.h @@ -515,6 +515,18 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif +#ifdef SMALL_MATRIX_OPT +int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, 
double * C, BLASLONG ldc); +int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +#endif + int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 0136f18ab..eb2abcdc0 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,11 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT + #elif defined(BFLOAT16) #define D_TO_BF16_K SBDTOBF16_K @@ -931,6 +936,11 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else @@ -1236,6 +1246,12 @@ #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K + +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else #ifdef XDOUBLE diff --git a/common_s.h b/common_s.h index 34903ec49..6ad98ba8b 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,11 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn +#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt +#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn +#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index 
10426fd8f..d2fb42ff7 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,6 +105,18 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#ifdef SMALL_MATRIX_OPT +//Only support s/dgemm small matrix optimiztion so far. +static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, +#endif +#endif +}; +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -417,6 +429,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); + MNK = (double) args.m * (double) args.n * (double) args.k; + +#ifdef SMALL_MATRIX_OPT +#if !defined(COMPLEX) + //need to tune small matrices cases. + if(MNK <= 100.0*100.0*100.0){ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, + args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + return; + } +#endif +#endif + + buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); @@ -428,7 +454,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - MNK = (double) args.m * (double) args.n * (double) args.k; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d9e3ec36..88e5eb2d6 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -447,6 +447,19 @@ XBLASOBJS += \ endif +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +SBLASOBJS += \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) 
sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +endif + ###### BLAS extensions ##### ifeq ($(BUILD_SINGLE),1) @@ -4237,3 +4250,63 @@ endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SAMLL_K_NN +DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SAMLL_K_NT +DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SAMLL_K_TN +DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SAMLL_K_TT +DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + + +ifndef SGEMM_SAMLL_K_NN +SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SAMLL_K_NT +SGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SAMLL_K_TN +SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SAMLL_K_TT +SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(SGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..efcc27cba --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +{ + //naive implemtation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i Date: Tue, 28 Apr 2020 22:35:36 +0800 Subject: [PATCH 022/143] Add alpha=1.0 beta=0.0 for small gemm. --- common_d.h | 5 ++ common_level3.h | 11 ++++ common_macro.h | 14 ++++ common_s.h | 5 ++ interface/gemm.c | 18 +++++- kernel/Makefile.L3 | 64 ++++++++++++++++++- .../gemm_small_matrix_kernel_a1b0_nn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_nt.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tt.c | 49 ++++++++++++++ 10 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tt.c diff --git a/common_d.h b/common_d.h index dad304a5f..f5d7935fa 100644 --- a/common_d.h +++ b/common_d.h @@ -163,6 +163,11 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt +#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn 
+#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt +#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn +#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index 751592b67..31d514cd5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -525,6 +525,17 @@ int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + +int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index eb2abcdc0..2f7263023 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,6 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT #elif defined(BFLOAT16) @@ -941,6 +945,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else @@ -1252,6 +1261,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else #ifdef XDOUBLE diff --git a/common_s.h b/common_s.h index 6ad98ba8b..440b78723 100644 --- a/common_s.h +++ b/common_s.h @@ -169,6 +169,11 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt +#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn +#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt +#define SGEMM_SMALL_KERNEL_A1B0_TN 
sgemm_small_kernel_a1b0_tn +#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index d2fb42ff7..da602f7a9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -115,6 +115,15 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif #endif }; + +static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, +#endif +#endif +}; #endif #ifndef CBLAS @@ -435,8 +444,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) //need to tune small matrices cases. if(MNK <= 100.0*100.0*100.0){ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, - args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + + if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + }else{ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + } + return; } #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 88e5eb2d6..448d22e4e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -452,11 +452,15 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + 
sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4282,6 +4286,34 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef DGEMM_SAMLL_K_A1B0_NN +DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_NT +DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TN +DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TT +DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + ifndef SGEMM_SAMLL_K_NN SGEMM_SAMLL_K_NN = 
../generic/gemm_small_matrix_kernel_nn.c @@ -4310,3 +4342,31 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SAMLL_K_A1B0_NN +SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_NT +SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_TN +SGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_TT +SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c new file mode 100644 index 000000000..8e3417027 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +{ + //naive implemtation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i Date: Tue, 28 Apr 2020 23:15:20 +0800 Subject: [PATCH 023/143] Fix gemm interface bug for small matrix. 
--- interface/gemm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index da602f7a9..4f1bbfd1c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -145,7 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -269,8 +269,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; +#endif + +#ifdef SMP #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -438,7 +441,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#if defined(SMP) || defined(SMALL_MATRIX_OPT) MNK = (double) args.m * (double) args.n * (double) args.k; +#endif #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) From 59cb5de46b89a080d1190e89bed543fd32f924c7 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 29 Apr 2020 00:19:19 +0800 Subject: [PATCH 024/143] Refs #2587 Fix typos. 
--- kernel/Makefile.L3 | 96 +++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 448d22e4e..6476334e9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4258,115 +4258,115 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### -ifndef DGEMM_SAMLL_K_NN -DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef DGEMM_SAMLL_K_NT -DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef DGEMM_SAMLL_K_TN -DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef DGEMM_SAMLL_K_TT -DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef 
DGEMM_SAMLL_K_A1B0_NN -DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_A1B0_NN +DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef DGEMM_SAMLL_K_A1B0_NT -DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef DGEMM_SMALL_K_A1B0_NT +DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef DGEMM_SAMLL_K_A1B0_TN -DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_A1B0_TN +DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef DGEMM_SAMLL_K_A1B0_TT -DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_A1B0_TT +DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_NN -SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef SGEMM_SAMLL_K_NT -SGEMM_SAMLL_K_NT = 
../generic/gemm_small_matrix_kernel_nt.c +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef SGEMM_SAMLL_K_TN -SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef SGEMM_SAMLL_K_TT -SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NT) +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_A1B0_NN -SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_A1B0_NN +SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef SGEMM_SAMLL_K_A1B0_NT -SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_A1B0_NT +SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef SGEMM_SAMLL_K_A1B0_TN -SGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_A1B0_TN +SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef 
SGEMM_SAMLL_K_A1B0_TT -SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_A1B0_TT +SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ From 17d32a4a8271141be2fb96c8c767ac1ed2e60a36 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 07:55:27 +0800 Subject: [PATCH 025/143] Change a1b0 gemm to b0 gemm. 
--- common_d.h | 8 +-- common_level3.h | 18 +++--- common_macro.h | 24 ++++---- common_s.h | 8 +-- interface/gemm.c | 10 ++-- kernel/Makefile.L3 | 56 +++++++++---------- ..._nn.c => gemm_small_matrix_kernel_b0_nn.c} | 4 +- ..._nt.c => gemm_small_matrix_kernel_b0_nt.c} | 4 +- ..._tn.c => gemm_small_matrix_kernel_b0_tn.c} | 4 +- ..._tt.c => gemm_small_matrix_kernel_b0_tt.c} | 4 +- 10 files changed, 70 insertions(+), 70 deletions(-) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nn.c => gemm_small_matrix_kernel_b0_nn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nt.c => gemm_small_matrix_kernel_b0_nt.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tn.c => gemm_small_matrix_kernel_b0_tn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tt.c => gemm_small_matrix_kernel_b0_tt.c} (95%) diff --git a/common_d.h b/common_d.h index f5d7935fa..42c14e828 100644 --- a/common_d.h +++ b/common_d.h @@ -163,10 +163,10 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt -#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn -#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt -#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn -#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt +#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn +#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt +#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn +#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt #else diff --git a/common_level3.h b/common_level3.h index 31d514cd5..7be7ab06b 100644 --- a/common_level3.h +++ b/common_level3.h @@ -526,15 +526,15 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, 
BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/common_macro.h b/common_macro.h index 2f7263023..fa7884180 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,10 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT #elif defined(BFLOAT16) @@ -945,10 +945,10 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif @@ -1261,10 +1261,10 @@ 
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif #else diff --git a/common_s.h b/common_s.h index 440b78723..685d73062 100644 --- a/common_s.h +++ b/common_s.h @@ -169,10 +169,10 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt -#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn -#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt -#define SGEMM_SMALL_KERNEL_A1B0_TN sgemm_small_kernel_a1b0_tn -#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt +#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn +#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt +#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn +#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt #else diff --git a/interface/gemm.c b/interface/gemm.c index 4f1bbfd1c..3730f37fa 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -116,11 +116,11 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif }; -static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef COMPLEX - GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, + 
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif #endif }; @@ -450,8 +450,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS //need to tune small matrices cases. if(MNK <= 100.0*100.0*100.0){ - if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + if(*(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6476334e9..c9544086a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -453,14 +453,14 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) 
dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4286,32 +4286,32 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef DGEMM_SMALL_K_A1B0_NN -DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef DGEMM_SMALL_K_A1B0_NT -DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef DGEMM_SMALL_K_A1B0_TN -DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef DGEMM_SMALL_K_A1B0_TT -DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) +$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ 
-$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4343,30 +4343,30 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SMALL_K_A1B0_NN -SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef SGEMM_SMALL_K_A1B0_NT -SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef SGEMM_SMALL_K_A1B0_TN -SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef SGEMM_SMALL_K_A1B0_TT -SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
-$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c similarity index 95% rename from kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c rename to kernel/generic/gemm_small_matrix_kernel_b0_nn.c index 8e3417027..3be918017 100644 --- a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) { //naive implemtation //Column major @@ -41,7 +41,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B for(k=0; k Date: Fri, 28 Aug 2020 21:00:54 +0800 Subject: [PATCH 026/143] Refs #2587 Add small matrix optimization reference kernel for c/zgemm. 
--- common_c.h | 40 +++ common_level3.h | 80 +++++ common_macro.h | 80 +++++ common_z.h | 40 +++ interface/gemm.c | 35 ++- kernel/Makefile.L3 | 293 ++++++++++++++++++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 +++++ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 +++++ kernel/generic/zgemm_small_matrix_kernel_nn.c | 78 +++++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 82 +++++ 14 files changed, 1193 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tt.c diff --git a/common_c.h b/common_c.h index 40ecf5b8b..9388ece93 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,46 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn +#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt +#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr +#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc + +#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn +#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt +#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr +#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc + +#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn +#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt +#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr +#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc 
+ +#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn +#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct +#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr +#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc + +#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn +#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt +#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr +#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc + +#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn +#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt +#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr +#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc + +#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn +#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt +#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr +#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc + +#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn +#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct +#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr +#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc + #else #define CAMAX_K gotoblas -> camax_k diff --git a/common_level3.h b/common_level3.h index 7be7ab06b..5741f56d5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,6 +536,86 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, 
BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, 
BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * 
C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float 
* C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, 
double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index fa7884180..2cccf9b39 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2093,6 +2093,46 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN 
+#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC + #else #define AMAX_K CAMAX_K @@ -2516,6 +2556,46 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR +#define 
GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC + #endif #endif diff --git a/common_z.h b/common_z.h index f1e78dd08..8594ec74d 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,46 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn +#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt +#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr +#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc + +#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn +#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt +#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr +#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc + +#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn +#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt +#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr +#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc + +#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn +#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct +#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr +#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc + +#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn +#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt +#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr +#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc + +#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn +#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt +#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr +#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc + +#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn +#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt +#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr +#define ZGEMM_SMALL_KERNEL_B0_RC 
zgemm_small_kernel_b0_rc + +#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn +#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct +#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr +#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc + #else #define ZAMAX_K gotoblas -> zamax_k diff --git a/interface/gemm.c b/interface/gemm.c index 3730f37fa..b73baa9bd 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -124,6 +124,28 @@ static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLAS #endif #endif }; + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +#endif +#endif +}; + +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +#endif +#endif +}; #endif #ifndef CBLAS @@ -446,20 +468,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT -#if !defined(COMPLEX) //need to tune small matrices cases. 
if(MNK <= 100.0*100.0*100.0){ - + +#if !defined(COMPLEX) if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } - +#else + if(beta[0] == 0.0 && beta[1] == 0.0){ + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + }else{ + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + } +#endif return; } #endif -#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index c9544086a..1c4a00158 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -462,6 +462,42 @@ DBLASOBJS += \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +CBLASOBJS += \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + 
cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) 
zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + endif ###### BLAS extensions ##### @@ -4370,3 +4406,260 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE 
-DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = 
../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT +ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) 
$(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c new file mode 100644 index 000000000..11e746e52 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c new file mode 100644 index 000000000..1ef743017 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c new file mode 100644 index 000000000..2cd3ebcf2 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c new file mode 100644 index 000000000..25b05b4aa --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..6ef1b9655 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c new file mode 100644 index 000000000..3c81ad79e --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c new file mode 100644 index 000000000..143190bb1 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c new file mode 100644 index 000000000..246e26e84 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} From 6022e5629c7708b114a3c2387e652ebd32122300 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 22:36:36 +0800 Subject: [PATCH 027/143] Refs #2587 fix small matrix c/zgemm bug. 
--- common_level3.h | 150 +++++++++--------- interface/gemm.c | 22 ++- .../generic/zgemm_small_matrix_kernel_b0_nn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_nt.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tt.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_nn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_nt.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tt.c | 10 +- 10 files changed, 116 insertions(+), 120 deletions(-) diff --git a/common_level3.h b/common_level3.h index 5741f56d5..a3a487dab 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,85 +536,85 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG 
m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); -int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float 
* B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float 
beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); -int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_ct(BLASLONG m, 
BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG 
lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG 
ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, 
float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, 
float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int 
zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/interface/gemm.c b/interface/gemm.c index b73baa9bd..7251993ee 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,47 +106,43 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT -//Only support s/dgemm small matrix optimiztion so far. 
+ +#ifndef COMPLEX static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, #endif -#endif }; static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif -#endif }; -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#else + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, #endif -#endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, 
GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif -#endif }; #endif +#endif #ifndef CBLAS @@ -479,9 +475,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS } #else if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } #endif return; diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c index 11e746e52..3ab057fef 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; @@ -65,8 +65,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c index 1ef743017..dc35f4a6d 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c index 2cd3ebcf2..479a56e8f 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c index 25b05b4aa..b698973dd 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c index 6ef1b9655..4bf6bf7ee 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -65,12 +65,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c index 3c81ad79e..288e49c13 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c index 143190bb1..1e2a5aed4 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c index 246e26e84..180043539 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } From 9186456a1297f7ee97bae56370c404114933a5ee Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 10:45:10 +0000 Subject: [PATCH 028/143] small matrix: SkylakeX: add SGEMM NN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nn_skylakex.c | 424 ++++++++++++++++++ 3 files changed, 428 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3d71584fe..1a2e67b52 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,8 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c 
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..704e964b8 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..f2c79873e --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,424 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#else +#define STORE_512(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ + _mm512_storeu_ps(&C[offset##M##N], result##M##N) +#endif + +#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() +#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ + _mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N) 
+#else +#define STORE_256(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \ + result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \ + _mm256_storeu_ps(&C[offset##M##N], result##M##N) +#endif + +#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):) +#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)]) +#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ + _mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N) +#else +#define STORE_128(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*4); \ + result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) +#endif + +#define DECLARE_RESULT_S(M, N) float result##M##N = 0; +#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M] +#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] +#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N +#if defined(B0) +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha +#else +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + 
BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + __mmask8 mask = 0xff; // just use to avoid SSE instruction + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, 
x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); 
STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m8; i += 8) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + for (k = 0; k < K; 
k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_256(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + } + __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_128(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); + DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + MATMUL_S(0, 2); MATMUL_S(1, 2); + 
MATMUL_S(0, 3); MATMUL_S(1, 3); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + STORE_S(0, 2); STORE_S(1, 2); + STORE_S(0, 3); STORE_S(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); MATMUL_S(1, 0); + } + STORE_S(0, 0); STORE_S(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + DECLARE_RESULT_S(0, 2); + DECLARE_RESULT_S(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); + MATMUL_S(0, 1); + MATMUL_S(0, 2); + MATMUL_S(0, 3); + } + STORE_S(0, 0); + STORE_S(0, 1); + STORE_S(0, 2); + STORE_S(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); + MATMUL_S(0, 1); + } + STORE_S(0, 0); + STORE_S(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); + } + STORE_S(0, 0); + } + } +} From f88470323bdb72a1e3ac54717606810699319d3b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 15:59:14 +0000 Subject: [PATCH 029/143] Optimize M < 16 using AVX512 mask --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) 
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index f2c79873e..f0b6d63a6 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -31,17 +31,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ _mm512_storeu_ps(&C[offset##M##N], result##M##N) +#define MASK_STORE_512(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) #endif #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() @@ -241,6 +249,51 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } + if (M - i > 0) { + register __mmask16 mask asm("k1") = 
(1UL << (M - i)) - 1; + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 49b61a3f3027e24f19e78e573e50c86432aec574 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:24:10 +0000 Subject: [PATCH 030/143] Small Matrix: skylakex: sgemm_nn: optimize for M <= 8 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 302 +++++++++++++++++- 1 file changed, 301 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index f0b6d63a6..ae4a9daa3 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "common.h" #include +#include #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) @@ -52,6 +53,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) #endif +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#endif + + + #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() #define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) #define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) @@ -249,7 +262,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } - if (M - i > 0) { + if (M - i > 8) { register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1; for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); @@ -294,6 +307,293 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return; } + int mm = M - i; + if (mm) { + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask8 = (1UL << mm) - 1; + __mmask16 mask; + BLASLONG k16 = K & ~15; + BLASLONG k8 = K & ~7; + for (k = 0; k < k8; k += 8) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(0 + 
k)]); + r1 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(3 + k)]); + r4 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(4 + k)]); + r5 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(5 + k)]); + r6 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(6 + k)]); + r7 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(7 + k)]); + + t0 = _mm256_unpacklo_ps(r0, r1); + t1 = _mm256_unpackhi_ps(r0, r1); + t2 = _mm256_unpacklo_ps(r2, r3); + t3 = _mm256_unpackhi_ps(r2, r3); + t4 = _mm256_unpacklo_ps(r4, r5); + t5 = _mm256_unpackhi_ps(r4, r5); + t6 = _mm256_unpacklo_ps(r6, r7); + t7 = _mm256_unpackhi_ps(r6, r7); + + r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0)); + r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2)); + r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); + r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); + r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0)); + r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2)); + r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); + r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); + + t0 = _mm256_permute2f128_ps(r0, r4, 0x20); + t1 = _mm256_permute2f128_ps(r1, r5, 0x20); + t2 = _mm256_permute2f128_ps(r2, r6, 0x20); + t3 = _mm256_permute2f128_ps(r3, r7, 0x20); + t4 = _mm256_permute2f128_ps(r0, r4, 0x31); + t5 = _mm256_permute2f128_ps(r1, r5, 0x31); + t6 = _mm256_permute2f128_ps(r2, r6, 0x31); + t7 = _mm256_permute2f128_ps(r3, r7, 0x31); + + switch (mm) { + case 8: _mm256_storeu_ps(&mbuf[k + 7*K], t7); + case 7: _mm256_storeu_ps(&mbuf[k + 6*K], t6); + case 6: _mm256_storeu_ps(&mbuf[k + 5*K], t5); + case 5: _mm256_storeu_ps(&mbuf[k + 4*K], t4); + case 4: _mm256_storeu_ps(&mbuf[k + 3*K], t3); + case 3: _mm256_storeu_ps(&mbuf[k + 2*K], t2); + case 2: _mm256_storeu_ps(&mbuf[k + 1*K], t1); + case 1: _mm256_storeu_ps(&mbuf[k + 0*K], t0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + 
} + } + int mi = 0; + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 
3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + 
for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + STORE_REDUCE(0, 2); + STORE_REDUCE(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 3d8c6d9607c82a999ad8661834d0d78605a5f321 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:33:07 +0000 Subject: [PATCH 031/143] Small Matrix: skylakex: sgemm nn: clean up unused code --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 
222 ------------------ 1 file changed, 222 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index ae4a9daa3..a5c530593 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -63,48 +63,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #endif - - -#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() -#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) -#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - _mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N) -#else -#define STORE_256(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \ - result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \ - _mm256_storeu_ps(&C[offset##M##N], result##M##N) -#endif - -#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):) -#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)]) -#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - _mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N) -#else -#define STORE_128(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + 
(M*4); \ - result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \ - _mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) -#endif - -#define DECLARE_RESULT_S(M, N) float result##M##N = 0; -#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M] -#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] -#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N -#if defined(B0) -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha -#else -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta -#endif - #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else @@ -594,184 +552,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp free(mbuf); return; } - __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m8; i += 8) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - DECLARE_RESULT_256(0, 2); - DECLARE_RESULT_256(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - MATMUL_256(0, 2); - MATMUL_256(0, 3); - } - STORE_256(0, 0); - STORE_256(0, 1); - STORE_256(0, 2); - STORE_256(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - MATMUL_256(0, 0); - MATMUL_256(0, 1); - } - STORE_256(0, 0); - STORE_256(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_256(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_256(0, 
x); - BROADCAST_LOAD_B_256(x, 0); - MATMUL_256(0, 0); - } - STORE_256(0, 0); - } - } - __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m4; i += 4) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - DECLARE_RESULT_128(0, 2); - DECLARE_RESULT_128(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - MATMUL_128(0, 2); - MATMUL_128(0, 3); - } - STORE_128(0, 0); - STORE_128(0, 1); - STORE_128(0, 2); - STORE_128(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - MATMUL_128(0, 0); - MATMUL_128(0, 1); - } - STORE_128(0, 0); - STORE_128(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_128(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); - MATMUL_128(0, 0); - } - STORE_128(0, 0); - } - } - for (; i < m2; i += 2) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); - DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); - DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - MATMUL_S(0, 2); MATMUL_S(1, 2); - MATMUL_S(0, 3); MATMUL_S(1, 3); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - STORE_S(0, 2); STORE_S(1, 2); - STORE_S(0, 3); STORE_S(1, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); 
DECLARE_RESULT_S(1, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); MATMUL_S(1, 0); - } - STORE_S(0, 0); STORE_S(1, 0); - } - } - for (; i < M; i += 1) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - DECLARE_RESULT_S(0, 2); - DECLARE_RESULT_S(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); - MATMUL_S(0, 1); - MATMUL_S(0, 2); - MATMUL_S(0, 3); - } - STORE_S(0, 0); - STORE_S(0, 1); - STORE_S(0, 2); - STORE_S(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); - MATMUL_S(0, 1); - } - STORE_S(0, 0); - STORE_S(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); - } - STORE_S(0, 0); - } - } } From 13b32f69b78b15e7d95978011ea6c2bb3d9e3642 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 12 May 2021 17:08:18 +0000 Subject: [PATCH 032/143] Small Matrix: skylakex: sgemm nn: reduce store 4 M at a time --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 64 ++++++++++++++----- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index a5c530593..be9f085c0 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ 
b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -57,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#define REDUCE_M4(N) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \ + r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); #if defined(B0) #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} #else #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} #endif #if defined(B0) @@ -75,14 +95,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, 
BLASLONG lda, FLOAT alp BLASLONG m64 = M & ~63; BLASLONG m32 = M & ~31; BLASLONG m16 = M & ~15; - BLASLONG m8 = M & ~7; BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; BLASLONG n4 = N & ~3; BLASLONG n2 = N & ~1; - __mmask8 mask = 0xff; // just use to avoid SSE instruction __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) @@ -220,8 +238,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } - if (M - i > 8) { - register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1; + int mm = M - i; + if (!mm) return 0; + if (mm > 8 || K < 32) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -263,10 +283,20 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } MASK_STORE_512(0, 0); } - return; - } - int mm = M - i; - if (mm) { + } else { + /* M => [1, 8] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 16 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. 
+ */ FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); __mmask8 mask8 = (1UL << mm) - 1; __mmask16 mask; @@ -328,6 +358,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } int mi = 0; + mask8 = 0xff; // just use to avoid SSE instruction + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -354,10 +389,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); - STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2); - STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3); + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -378,9 +410,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); - + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); } for (; j < N; j += 1) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -398,7 +428,7 @@ int CNAME(BLASLONG M, BLASLONG N, 
BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE_M4(0); } } @@ -550,6 +580,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } free(mbuf); - return; } + return 0; } From 4c9d9940fdd6a458289a02e850afd65d5b9689ba Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 13 May 2021 09:41:51 +0000 Subject: [PATCH 033/143] Small Matrix: skylakex: sgemm nn: reduce store 4 N at a time --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index be9f085c0..c9f43f9a2 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -57,10 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) -#define REDUCE_M4(N) \ +#define REDUCE_4(rr0, rr1, rr2, rr3) \ __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ - r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \ - r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ @@ -68,12 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) #if defined(B0) #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); #define STORE_REDUCE_M4(N) {\ REDUCE_M4(N) \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #else #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #define STORE_REDUCE_M4(N) {\ @@ -81,6 +87,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ + s0 = _mm_fmadd_ps(s1, beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #endif #if defined(B0) @@ -363,6 +375,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp #if !defined(B0) __m128 beta_128 = _mm_broadcast_ss(&beta); #endif + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -458,10 +471,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(0, 3); MATMUL_512(1, 3); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); - STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); - STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); @@ -532,10 +542,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(0, 3); } - STORE_REDUCE(0, 0); - STORE_REDUCE(0, 1); - STORE_REDUCE(0, 2); - STORE_REDUCE(0, 3); + STORE_REDUCE_N4(0); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); From a87736346fd3988618c0d8895827566fce5a5487 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 13 May 2021 10:16:54 +0000 Subject: [PATCH 034/143] Small Matrix: skylakex: sgemm nn: add n6 to improve performance --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 
c9f43f9a2..a67541161 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -110,6 +110,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; BLASLONG n2 = N & ~1; @@ -165,7 +166,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m32; i += 32) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (;j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); @@ -208,7 +236,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m16; i += 16) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 
1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); @@ -228,6 +283,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 2); STORE_512(0, 3); } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -254,26 +310,54 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp if (!mm) return 0; if (mm > 8 || K < 32) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); for (k = 0; k < K; k++) { MASK_LOAD_A_512(0, x); BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); MATMUL_512(0, 0); MATMUL_512(0, 1); MATMUL_512(0, 2); MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); } MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + 
BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 9967e61abb3ba0b87a043662382c515ed9d220bb Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 19 May 2021 10:50:03 +0000 Subject: [PATCH 035/143] Small Matrix: skylakex: sgemm nn: fix error when beta not zero --- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index a67541161..99856d0af 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -42,15 +42,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ - _mm512_storeu_ps(&C[offset##M##N], result##M##N) + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ - _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #endif #define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); From ca7682e3a3dceeb52ba1ad554f384388ffb24c9a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:24:31 +0000 Subject: [PATCH 036/143] Small Matrix: skylakex: sgemm nn: fix n6 conflicts with n4 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 99856d0af..9bc7a7c58 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -191,26 +191,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(1, 4); STORE_512(0, 5); STORE_512(1, 5); } - for (;j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); LOAD_A_512(1, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - STORE_512(0, 2); STORE_512(1, 2); - STORE_512(0, 3); STORE_512(1, 3); - } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); @@ -261,27 +241,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - STORE_512(0, 0); - STORE_512(0, 1); - STORE_512(0, 2); - STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -335,27 +294,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - MASK_LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - MASK_STORE_512(0, 
0); - MASK_STORE_512(0, 1); - MASK_STORE_512(0, 2); - MASK_STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 0d72d75bf9455c91b6f0c4ecf5b7555845dccf6f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:47:10 +0000 Subject: [PATCH 037/143] Small Matrix: skylakex: add sgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nt_skylakex.c | 366 ++++++++++++++++++ 3 files changed, 370 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1a2e67b52..d3560bf80 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,6 +12,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..6d7934be1 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..3fc842669 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,366 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N 
% 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + 
STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); 
MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < 
m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm > 0) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + 
DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } +} From ae3f5c737c24e6fdb7de4559969bee5631aa1683 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 21 May 2021 13:31:31 +0000 Subject: [PATCH 038/143] Small Matrix: skylakex: sgemm nt: optimize for M < 12 
--- .../x86_64/sgemm_small_kernel_nt_skylakex.c | 171 +++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index 3fc842669..f293bf9f9 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -35,11 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) #else #define STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ @@ -49,6 +57,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); #endif #if defined(B0) @@ -66,6 +82,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; BLASLONG n8 = N & ~7; BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; @@ -284,7 +302,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } int mm = M - i; - if (mm > 0) { + if (mm >= 12) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); @@ -362,5 +380,156 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } MASK_STORE_512(0, 0); } + } else if (mm > 0) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 
1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 
1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } } + return 0; } From 642c3938790b45606dea7450a6fbc23b6c9b9b9c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 26 May 2021 16:30:57 +0000 Subject: [PATCH 039/143] Small Matrix: skylakex: add sgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_tn_skylakex.c | 316 ++++++++++++++++++ 3 files changed, 320 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 
d3560bf80..5e0d9e5b4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,6 +14,8 @@ SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..0f9745b72 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..5a9a4ea32 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,316 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, 
s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) +#define STORE_M4(N, s0) _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); +#define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); + +#define STORE_N4(M, s0) \ + s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k16 = K & ~15; + + __mmask16 mask; + __mmask8 mask8 = 0xff; // just use to avoid SSE instruction + + __m128i vindex_n = _mm_set_epi32(ldc*3, 
ldc*2, ldc, 0); + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); 
DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int 
remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << 
remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 5dc7c3c8e572c1760cd9aba40dde1db54bb3f2e3 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:03:56 +0000 Subject: [PATCH 040/143] Small Matrix: add GEMM_SMALL_MATRIX_PERMIT to tune small matrics case --- common_c.h | 2 ++ common_d.h | 1 + common_level3.h | 8 +++++ common_macro.h | 10 ++++++ common_s.h | 2 ++ common_z.h | 2 ++ interface/gemm.c | 9 +++--- kernel/Makefile.L3 | 31 ++++++++++++++++++ kernel/generic/gemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ kernel/generic/zgemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ 10 files changed, 135 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_permit.c create mode 100644 kernel/generic/zgemm_small_matrix_permit.c diff --git a/common_c.h b/common_c.h index 9388ece93..dc273eef0 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,8 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT 
cgemm_small_matrix_permit + #define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn #define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt #define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr diff --git a/common_d.h b/common_d.h index 42c14e828..bb85f1232 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,7 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit #define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn #define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt diff --git a/common_level3.h b/common_level3.h index a3a487dab..187402a9a 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,11 +516,15 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, 
double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); @@ -536,6 +540,8 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); @@ -556,6 +562,8 @@ int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLON int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + int 
zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index 2cccf9b39..aeb9a205b 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,8 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN @@ -940,6 +942,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -1256,6 +1260,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -2093,6 +2099,8 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR @@ -2556,6 +2564,8 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR 
CGEMM_SMALL_KERNEL_NR diff --git a/common_s.h b/common_s.h index 685d73062..5851014cf 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,8 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit + #define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn #define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn diff --git a/common_z.h b/common_z.h index 8594ec74d..6088260a1 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,8 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit + #define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn #define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt #define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr diff --git a/interface/gemm.c b/interface/gemm.c index 7251993ee..ad8780668 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -464,25 +464,26 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT - //need to tune small matrices cases. 
- if(MNK <= 100.0*100.0*100.0){ - #if !defined(COMPLEX) + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } + return; + } #else + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } -#endif return; } #endif +#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c4a00158..f977793a0 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -451,18 +451,21 @@ endif ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) 
dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -481,6 +484,7 @@ CBLASOBJS += \ cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -4294,6 +4298,10 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + ifndef DGEMM_SMALL_K_NN DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif @@ -4310,6 +4318,9 @@ ifndef DGEMM_SMALL_K_TT DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4350,6 +4361,9 @@ $(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif ifndef SGEMM_SMALL_K_NN SGEMM_SMALL_K_NN = 
../generic/gemm_small_matrix_kernel_nn.c @@ -4367,6 +4381,9 @@ ifndef SGEMM_SMALL_K_TT SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -4407,6 +4424,9 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif ifndef CGEMM_SMALL_K_NN CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c @@ -4424,6 +4444,9 @@ ifndef CGEMM_SMALL_K_TT CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + $(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -4536,6 +4559,10 @@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + ifndef ZGEMM_SMALL_K_NN ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif @@ -4552,6 +4579,10 @@ ifndef ZGEMM_SMALL_K_TT ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + $(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c new file mode 100644 index 000000000..6e1ab1fc1 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c new file mode 100644 index 000000000..288937256 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} From 02c6e764f2e94779ae5699ca2ea8c2189aa9fa02 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:26:49 +0000 Subject: [PATCH 041/143] Small Matrix: skylakex: add SGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../sgemm_small_kernel_permit_skylakex.c | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5e0d9e5b4..264e3a9f4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,7 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c 
b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..159ae10b5 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + // tuning for A transpose + if (transa) { + if (transb) { + return 0; // TT kernel not support yet + } else { // TN kernel + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + } + + return 1; +} From 72e070539cd13364c8a02ac34e3dfcd65b657c7a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 31 May 2021 14:53:03 +0000 Subject: [PATCH 042/143] Small Matrix: skylakex: add sgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tt_skylakex.c | 3 + .../sgemm_small_kernel_permit_skylakex.c | 7 +- .../x86_64/sgemm_small_kernel_tt_skylakex.c | 414 ++++++++++++++++++ 4 files changed, 424 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 264e3a9f4..0f58a4d46 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -17,6 +17,8 @@ SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..27d9e0afd --- /dev/null +++ 
b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,3 @@ +#define B0 1 +#define TT 1 +#include "./sgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c index 159ae10b5..cbf2374bd 100644 --- a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -35,8 +35,11 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph // tuning for A transpose if (transa) { if (transb) { - return 0; // TT kernel not support yet - } else { // TN kernel + /* TT kernel perform not good when: + * 1. K is too small. + */ + if (K < 4) return 0; + } else { /* TN kernel perform not good when: * 1. C matrix is too big * 2. K is too small diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8da560ef7 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,414 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + 
N*16 + x + y*8)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*4)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#define REORDER_8x16(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + t4 = _mm512_unpacklo_ps(r4, r5); \ + t5 = _mm512_unpackhi_ps(r4, r5); \ + t6 = _mm512_unpacklo_ps(r6, r7); \ + t7 = _mm512_unpackhi_ps(r6, r7); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + v = _mm512_shuffle_ps(t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_ps(kc, t4, v); \ + r5 = _mm512_mask_blend_ps(k3, t6, v); \ + v = _mm512_shuffle_ps(t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_ps(kc, t5, v); \ + r7 = _mm512_mask_blend_ps(k3, t7, v); \ + t0 = _mm512_permutex2var_ps(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_ps(r1, idx_lo, 
r5); \ + t2 = _mm512_permutex2var_ps(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_ps(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_ps(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_ps(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_ps(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_ps(r3, idx_hi, r7); \ + t0 = _mm512_mul_ps(t0, alpha_512); \ + t1 = _mm512_mul_ps(t1, alpha_512); \ + t2 = _mm512_mul_ps(t2, alpha_512); \ + t3 = _mm512_mul_ps(t3, alpha_512); \ + t4 = _mm512_mul_ps(t4, alpha_512); \ + t5 = _mm512_mul_ps(t5, alpha_512); \ + t6 = _mm512_mul_ps(t6, alpha_512); \ + t7 = _mm512_mul_ps(t7, alpha_512); + +#define SAVE_8(N, x, y) {\ + __m256 v8 = _mm512_extractf32x8_ps(t##x, y); \ + STORE_8xy(v8, N, x, y); \ +} + +#define REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0, 0); SAVE_8(N, 1, 0); SAVE_8(N, 2, 0); SAVE_8(N, 3, 0); SAVE_8(N, 4, 0); SAVE_8(N, 5, 0); SAVE_8(N, 6, 0); SAVE_8(N, 7, 0); \ + SAVE_8(N, 0, 1); SAVE_8(N, 1, 1); SAVE_8(N, 2, 1); SAVE_8(N, 3, 1); SAVE_8(N, 4, 1); SAVE_8(N, 5, 1); SAVE_8(N, 6, 1); SAVE_8(N, 7, 1); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 16: SAVE_8(0, 7, 1); \ + case 15: SAVE_8(0, 6, 1); \ + case 14: SAVE_8(0, 5, 1); \ + case 13: SAVE_8(0, 4, 1); \ + case 12: SAVE_8(0, 3, 1); \ + case 11: SAVE_8(0, 2, 1); \ + case 10: SAVE_8(0, 1, 1); \ + case 9: SAVE_8(0, 0, 1); \ + case 8: SAVE_8(0, 7, 0); \ + case 7: SAVE_8(0, 6, 0); \ + case 6: SAVE_8(0, 5, 0); \ + case 5: SAVE_8(0, 4, 0); \ + case 4: SAVE_8(0, 3, 0); \ + case 3: SAVE_8(0, 2, 0); \ + case 2: SAVE_8(0, 1, 0); \ + case 1: SAVE_8(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x16(r0, r1, r2, r3) \ + __m512 t0, t1, t2, t3, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + 
t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + t0 = _mm512_mul_ps(r0, alpha_512); \ + t1 = _mm512_mul_ps(r1, alpha_512); \ + t2 = _mm512_mul_ps(r2, alpha_512); \ + t3 = _mm512_mul_ps(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m128 v4 = _mm512_extractf32x4_ps(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ + SAVE_4(N, 0, 2); SAVE_4(N, 1, 2); SAVE_4(N, 2, 2); SAVE_4(N, 3, 2); \ + SAVE_4(N, 0, 3); SAVE_4(N, 1, 3); SAVE_4(N, 2, 3); SAVE_4(N, 3, 3); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 16: SAVE_4(0, 3, 3); \ + case 15: SAVE_4(0, 2, 3); \ + case 14: SAVE_4(0, 1, 3); \ + case 13: SAVE_4(0, 0, 3); \ + case 12: SAVE_4(0, 3, 2); \ + case 11: SAVE_4(0, 2, 2); \ + case 10: SAVE_4(0, 1, 2); \ + case 9: SAVE_4(0, 0, 2); \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG 
m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo = _mm512_loadu_epi32(permute_table); + __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x16(0); + REORDER_STORE_8x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + 
nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x16(0); + } + } + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + REORDER_STORE_4x16(2); + REORDER_STORE_4x16(3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x16(0); + } + } + if (i < M) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 
3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; 
+} From 91ec21202bd8ae81f15dae79e004b2f00d20e559 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 11:31:50 +0000 Subject: [PATCH 043/143] Small Matrix: skylakex: add dgemm nn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nn_skylakex.c | 590 ++++++++++++++++++ 3 files changed, 594 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 0f58a4d46..a3c6f0556 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,8 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..a58738a25 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..8ffb899c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,590 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_pd1(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = 
_mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ + s0 = _mm256_fmadd_pd(s1, beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + 
BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); 
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); 
BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 4 || K < 16) { + register __mmask8 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); 
BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 4] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 8 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. 
+ */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask = (1UL << mm) - 1; + BLASLONG k8 = K & ~7; + BLASLONG k4 = K & ~3; + for (k = 0; k < k4; k += 4) { + __m256d r0, r1, r2, r3; + __m256d t0, t1, t2, t3; + r0 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(3 + k)]); + + t0 = _mm256_unpacklo_pd(r0, r1); + t1 = _mm256_unpackhi_pd(r0, r1); + t2 = _mm256_unpacklo_pd(r2, r3); + t3 = _mm256_unpackhi_pd(r2, r3); + + r0 = _mm256_permute2f128_pd(t0, t2, 0x20); + r1 = _mm256_permute2f128_pd(t1, t3, 0x20); + r2 = _mm256_permute2f128_pd(t0, t2, 0x31); + r3 = _mm256_permute2f128_pd(t1, t3, 0x31); + + switch (mm) { + case 4: _mm256_storeu_pd(&mbuf[k + 3*K], r3); + case 3: _mm256_storeu_pd(&mbuf[k + 2*K], r2); + case 2: _mm256_storeu_pd(&mbuf[k + 1*K], r1); + case 1: _mm256_storeu_pd(&mbuf[k + 0*K], r0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0); + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi32(permute_table); + __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); 
DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); 
MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); 
LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); 
MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} From f57fc932ac39c394e8f89bf7b6df3f1bddd315fd Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 14:23:56 +0000 Subject: [PATCH 044/143] Small Matrix: skylakex: add dgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nt_skylakex.c | 535 ++++++++++++++++++ 3 files changed, 539 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index a3c6f0556..db1e6cbff 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -29,6 +29,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..eafe2ce49 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c 
b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..0a95a68e2 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" +#include <stdio.h> +#include <memory.h> + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_sd(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + 
(M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + 
DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); 
MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) 
{ + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + 
MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 6) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + 
} + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); 
+ LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 
0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + 
BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 323d7da4f7c21b0a285af1527a47799c4adf69f4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 11:45:44 +0000 Subject: [PATCH 045/143] Small Matrix: skylakex: add dgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tt_skylakex.c | 392 ++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index db1e6cbff..3e84e794e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c 
DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..93fab1836 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8ff79d2c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,392 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*8)*ldc + i]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*4)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + 
N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#define REORDER_8x8(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512d t0, t1, t2, t3, t4, t5, t6, t7; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + t4 = _mm512_unpacklo_pd(r4, r5); \ + t5 = _mm512_unpackhi_pd(r4, r5); \ + t6 = _mm512_unpacklo_pd(r6, r7); \ + t7 = _mm512_unpackhi_pd(r6, r7); \ + r0 = _mm512_shuffle_f64x2(t0, t2, 0x88); \ + r1 = _mm512_shuffle_f64x2(t1, t3, 0x88); \ + r2 = _mm512_shuffle_f64x2(t0, t2, 0xdd); \ + r3 = _mm512_shuffle_f64x2(t1, t3, 0xdd); \ + r4 = _mm512_shuffle_f64x2(t4, t6, 0x88); \ + r5 = _mm512_shuffle_f64x2(t5, t7, 0x88); \ + r6 = _mm512_shuffle_f64x2(t4, t6, 0xdd); \ + r7 = _mm512_shuffle_f64x2(t5, t7, 0xdd); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_pd(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_pd(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_pd(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_pd(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_pd(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_pd(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_pd(r3, idx_hi, r7); \ + t0 = _mm512_mul_pd(t0, alpha_512); \ + t1 = _mm512_mul_pd(t1, alpha_512); \ + t2 = _mm512_mul_pd(t2, alpha_512); \ + 
t3 = _mm512_mul_pd(t3, alpha_512); \ + t4 = _mm512_mul_pd(t4, alpha_512); \ + t5 = _mm512_mul_pd(t5, alpha_512); \ + t6 = _mm512_mul_pd(t6, alpha_512); \ + t7 = _mm512_mul_pd(t7, alpha_512); + +#define SAVE_8(N, x) {\ + STORE_8xy(t##x, N, x, 0); \ +} + +#define REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0); SAVE_8(N, 1); SAVE_8(N, 2); SAVE_8(N, 3); SAVE_8(N, 4); SAVE_8(N, 5); SAVE_8(N, 6); SAVE_8(N, 7); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 8: SAVE_8(0, 7); \ + case 7: SAVE_8(0, 6); \ + case 6: SAVE_8(0, 5); \ + case 5: SAVE_8(0, 4); \ + case 4: SAVE_8(0, 3); \ + case 3: SAVE_8(0, 2); \ + case 2: SAVE_8(0, 1); \ + case 1: SAVE_8(0, 0); \ + } + +#define MASK_REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) \ + __m512d t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + r0 = _mm512_permutex2var_pd(t0, idx_lo, t2); \ + r1 = _mm512_permutex2var_pd(t1, idx_lo, t3); \ + r2 = _mm512_permutex2var_pd(t0, idx_hi, t2); \ + r3 = _mm512_permutex2var_pd(t1, idx_hi, t3); \ + t0 = _mm512_mul_pd(r0, alpha_512); \ + t1 = _mm512_mul_pd(r1, alpha_512); \ + t2 = _mm512_mul_pd(r2, alpha_512); \ + t3 = _mm512_mul_pd(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m256d v4 = _mm512_extractf64x4_pd(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 
1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); + __m256d beta_256 = _mm256_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + long long permute_table[] = { + 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, + 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); 
+ MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x8(0); + REORDER_STORE_8x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x8(0); + } + } + for (; i < m4; i += 4) { + long long permute_table2[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + idx_lo = _mm512_loadu_epi64(permute_table2); + idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + 
LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + REORDER_STORE_4x8(2); + REORDER_STORE_4x8(3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x8(0); + } + } + if (i < M) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + 
SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 3e79f6d89abe60b75a4a504670a676472b2d0918 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:56:40 +0000 Subject: [PATCH 046/143] Small Matrix: skylakex: add dgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tn_skylakex.c | 322 ++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3e84e794e..c1d8f8e89 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..1dfa0aaf1 --- /dev/null +++ 
b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..0881f35b2 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) +#define STORE_M4(N, s0) _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); +#define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), 
"v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); + +#define STORE_N4(M, s0) \ + s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k8 = K & ~7; + + __mmask8 mask; + + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0); + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + 
MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, 
x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + 
+ MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 8592c21af4d6328068b87f402a6801b30e2aebec Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:57:39 +0000 Subject: [PATCH 047/143] Small Matrix: skylakex: dgemm nn: fix typo in idx load --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index 8ffb899c8..ff2a04beb 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8); + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); From fa777f5517d4b43acfda8b8a58649af94c1e40b4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 14:55:54 +0000 Subject: [PATCH 048/143] Small Matrix: skylakex: add DGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../dgemm_small_kernel_permit_skylakex.c | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c1d8f8e89..eb0cbaf98 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,7 @@ DGEMMITCOPY = 
dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..9cca08e71 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + if (transa && !transb) { + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + return 1; +} From 210a1584c5299d8e53129b4e2a8b73b67046cc77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:19:16 +0200 Subject: [PATCH 049/143] Rebase source and edit TLS version of the message as well --- driver/others/memory.c | 46 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..500ec22c5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) 
|| defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,10 +1619,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE 
h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) { return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -1691,7 +1702,6 @@ void gotoblas_dummy_for_PGI(void) { #include #include #include -#include #include #include #include @@ -1969,7 +1979,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1977,7 +1987,7 @@ int blas_get_cpu_number(void){ if 
(blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2001,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2868,8 +2878,12 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); - + printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 898212efcda215ccab3b46b4a645c8eda2ca7948 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:50:14 +0200 Subject: [PATCH 050/143] Actually add the message to the TLS section --- driver/others/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 500ec22c5..460a3d557 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock); return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 6b58bca18b427a0c149d25542a5eb7c5ada6a19f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 15 Jun 2021 16:09:51 +0000 Subject: [PATCH 051/143] Small Matrix: disable low performance default kernel --- kernel/generic/gemm_small_matrix_permit.c | 3 +++ kernel/generic/zgemm_small_matrix_permit.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c index 6e1ab1fc1..1ae6d2520 100644 --- a/kernel/generic/gemm_small_matrix_permit.c +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c index 288937256..940ff5dc8 100644 --- a/kernel/generic/zgemm_small_matrix_permit.c +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } From 93c8bafff56052534554e3a47e56552c97217228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Aug 2021 10:45:45 +0200 Subject: [PATCH 052/143] Update Travis badge in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7e0d60a7..88a5a5035 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) From 478d1086c11f28903395bd13050dbca62aec81ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 03:12:41 +0000 Subject: [PATCH 053/143] Small Matrix: support DYNAMIC_ARCH build --- common_c.h | 83 +++++++++++++++-------------- common_d.h | 23 ++++---- common_param.h | 119 ++++++++++++++++++++++++++++++++++++++++++ common_s.h | 23 ++++---- common_z.h | 83 +++++++++++++++-------------- interface/gemm.c | 50 ++++++++++-------- kernel/setparam-ref.c | 37 +++++++++++++ 7 files changed, 295 insertions(+), 123 deletions(-) diff --git a/common_c.h b/common_c.h index dc273eef0..6cff610bb 100644 --- a/common_c.h +++ b/common_c.h @@ -234,46 +234,6 @@ #define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit -#define 
CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn -#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt -#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr -#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc - -#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn -#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt -#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr -#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc - -#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn -#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt -#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr -#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc - -#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn -#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct -#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr -#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc - -#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn -#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt -#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr -#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc - -#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn -#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt -#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr -#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc - -#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn -#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt -#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr -#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc - -#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn -#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct -#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr -#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc - #else #define CAMAX_K gotoblas -> camax_k @@ -468,8 +428,51 @@ #define CGEADD_K gotoblas -> cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit + #endif +#define 
CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) +#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) +#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) +#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) + +#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) +#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) +#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) +#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) + +#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) +#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) +#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) +#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) + +#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) +#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) +#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) +#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) + +#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) +#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) +#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) +#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) + +#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) +#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) +#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) +#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) + +#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) +#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) +#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) +#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) + +#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) 
+#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) +#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) +#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) + + #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn diff --git a/common_d.h b/common_d.h index bb85f1232..6f4bb2ded 100644 --- a/common_d.h +++ b/common_d.h @@ -159,16 +159,6 @@ #define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit -#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn -#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt -#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn -#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt - -#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn -#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt -#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn -#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt - #else #define DAMAX_K gotoblas -> damax_k @@ -293,8 +283,21 @@ #define DGEADD_K gotoblas -> dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit + #endif +#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) +#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) +#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) +#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) + +#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) +#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) +#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) +#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) + + #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn diff --git a/common_param.h b/common_param.h index 3e3ae06f8..7e8bea4fe 100644 --- a/common_param.h +++ b/common_param.h @@ -207,6 +207,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, 
BLASLONG, float *); #endif #ifdef BUILD_SINGLE +#ifdef SMALL_MATRIX_OPT + int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -314,6 +328,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif 
#ifdef BUILD_DOUBLE +#ifdef SMALL_MATRIX_OPT + int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + + int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + + int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -513,6 +540,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, 
float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); +#ifdef SMALL_MATRIX_OPT + int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + + int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float 
beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nc 
)(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float 
alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -679,6 +750,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); +#ifdef SMALL_MATRIX_OPT + int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + + int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double 
beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double 
alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * 
A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -1069,6 +1184,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); extern gotoblas_t *gotoblas; +#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) + #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB @@ -1174,6 +1291,8 @@ extern gotoblas_t *gotoblas; #else +#define FUNC_OFFSET(func) (size_t)(func) + #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define 
GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A diff --git a/common_s.h b/common_s.h index 5851014cf..fdd80b62f 100644 --- a/common_s.h +++ b/common_s.h @@ -166,16 +166,6 @@ #define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit -#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn -#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt -#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn -#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt - -#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn -#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt -#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn -#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt - #else #define SAMAX_K gotoblas -> samax_k @@ -311,8 +301,21 @@ #define SGEADD_K gotoblas -> sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit + #endif +#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) +#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) +#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) +#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) + +#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) +#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) +#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) +#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) + + #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn diff --git a/common_z.h b/common_z.h index 6088260a1..c12d71b39 100644 --- a/common_z.h +++ b/common_z.h @@ -234,46 +234,6 @@ #define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit -#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn -#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt -#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr -#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc - -#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn -#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt 
-#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr -#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc - -#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn -#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt -#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr -#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc - -#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn -#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct -#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr -#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc - -#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn -#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt -#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr -#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc - -#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn -#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt -#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr -#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc - -#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn -#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt -#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr -#define ZGEMM_SMALL_KERNEL_B0_RC zgemm_small_kernel_b0_rc - -#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn -#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct -#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr -#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc - #else #define ZAMAX_K gotoblas -> zamax_k @@ -468,8 +428,51 @@ #define ZGEADD_K gotoblas -> zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit + #endif +#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) +#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) +#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) +#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) + +#define ZGEMM_SMALL_KERNEL_TN 
FUNC_OFFSET(zgemm_small_kernel_tn) +#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) +#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) +#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) + +#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) +#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) +#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) +#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) + +#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) +#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) +#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) +#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) + +#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) +#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) +#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) +#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) + +#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) +#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) +#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) +#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) + +#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) +#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) +#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) +#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) + +#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) +#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) +#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) +#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) + + #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN 
zgemm_tn diff --git a/interface/gemm.c b/interface/gemm.c index ad8780668..f4b9f1537 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,25 +106,34 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + #ifndef COMPLEX -static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +static size_t gemm_small_kernel[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, - GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, #endif }; -static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { + +static size_t gemm_small_kernel_b0[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, #endif }; +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 
GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, @@ -133,7 +142,7 @@ static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLO #endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel_b0[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, @@ -141,6 +150,9 @@ static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLA GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif }; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif @@ -163,7 +175,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) +#ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -287,11 +299,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) - double MNK; -#endif - #ifdef SMP + double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -459,32 +468,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#if defined(SMP) || 
defined(SMALL_MATRIX_OPT) - MNK = (double) args.m * (double) args.n * (double) args.k; -#endif - #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } return; } #else if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); + (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); + (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } return; } #endif #endif - buffer = (XFLOAT *)blas_memory_alloc(0); @@ -497,7 +501,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - + MNK = (double) args.m * (double) 
args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 1e846a61c..f303d0dc6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -171,6 +171,14 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif +#if BUILD_SINGLE == 1 +#ifdef SMALL_MATRIX_OPT + sgemm_small_matrix_permitTS, + sgemm_small_kernel_nnTS, sgemm_small_kernel_ntTS, sgemm_small_kernel_tnTS, sgemm_small_kernel_ttTS, + sgemm_small_kernel_b0_nnTS, sgemm_small_kernel_b0_ntTS, sgemm_small_kernel_b0_tnTS, sgemm_small_kernel_b0_ttTS, +#endif +#endif + #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -257,6 +265,11 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) +#ifdef SMALL_MATRIX_OPT + dgemm_small_matrix_permitTS, + dgemm_small_kernel_nnTS, dgemm_small_kernel_ntTS, dgemm_small_kernel_tnTS, dgemm_small_kernel_ttTS, + dgemm_small_kernel_b0_nnTS, dgemm_small_kernel_b0_ntTS, dgemm_small_kernel_b0_tnTS, dgemm_small_kernel_b0_ttTS, +#endif dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -389,6 +402,18 @@ gotoblas_t TABLE_NAME = { #endif cgemm_oncopyTS, cgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + cgemm_small_matrix_permitTS, + cgemm_small_kernel_nnTS, cgemm_small_kernel_ntTS, cgemm_small_kernel_nrTS, cgemm_small_kernel_ncTS, + cgemm_small_kernel_tnTS, cgemm_small_kernel_ttTS, cgemm_small_kernel_trTS, cgemm_small_kernel_tcTS, + cgemm_small_kernel_rnTS, cgemm_small_kernel_rtTS, cgemm_small_kernel_rrTS, cgemm_small_kernel_rcTS, + cgemm_small_kernel_cnTS, cgemm_small_kernel_ctTS, cgemm_small_kernel_crTS, cgemm_small_kernel_ccTS, + cgemm_small_kernel_b0_nnTS, 
cgemm_small_kernel_b0_ntTS, cgemm_small_kernel_b0_nrTS, cgemm_small_kernel_b0_ncTS, + cgemm_small_kernel_b0_tnTS, cgemm_small_kernel_b0_ttTS, cgemm_small_kernel_b0_trTS, cgemm_small_kernel_b0_tcTS, + cgemm_small_kernel_b0_rnTS, cgemm_small_kernel_b0_rtTS, cgemm_small_kernel_b0_rrTS, cgemm_small_kernel_b0_rcTS, + cgemm_small_kernel_b0_cnTS, cgemm_small_kernel_b0_ctTS, cgemm_small_kernel_b0_crTS, cgemm_small_kernel_b0_ccTS, +#endif + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, @@ -533,6 +558,18 @@ gotoblas_t TABLE_NAME = { #endif zgemm_oncopyTS, zgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + zgemm_small_matrix_permitTS, + zgemm_small_kernel_nnTS, zgemm_small_kernel_ntTS, zgemm_small_kernel_nrTS, zgemm_small_kernel_ncTS, + zgemm_small_kernel_tnTS, zgemm_small_kernel_ttTS, zgemm_small_kernel_trTS, zgemm_small_kernel_tcTS, + zgemm_small_kernel_rnTS, zgemm_small_kernel_rtTS, zgemm_small_kernel_rrTS, zgemm_small_kernel_rcTS, + zgemm_small_kernel_cnTS, zgemm_small_kernel_ctTS, zgemm_small_kernel_crTS, zgemm_small_kernel_ccTS, + zgemm_small_kernel_b0_nnTS, zgemm_small_kernel_b0_ntTS, zgemm_small_kernel_b0_nrTS, zgemm_small_kernel_b0_ncTS, + zgemm_small_kernel_b0_tnTS, zgemm_small_kernel_b0_ttTS, zgemm_small_kernel_b0_trTS, zgemm_small_kernel_b0_tcTS, + zgemm_small_kernel_b0_rnTS, zgemm_small_kernel_b0_rtTS, zgemm_small_kernel_b0_rrTS, zgemm_small_kernel_b0_rcTS, + zgemm_small_kernel_b0_cnTS, zgemm_small_kernel_b0_ctTS, zgemm_small_kernel_b0_crTS, zgemm_small_kernel_b0_ccTS, +#endif + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, From fee5abd84bf01aba7a2223f7264fcc7da66d1b20 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 08:50:15 +0000 Subject: [PATCH 054/143] Small Matrix: support cmake build --- cmake/system.cmake | 4 ++ kernel/CMakeLists.txt | 110 
++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index f8bd6678e..e51dc1fdc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -258,6 +258,10 @@ if (NEED_PIC) endif() endif () +if (SMALL_MATRIX_OPT) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") +endif () + if (DYNAMIC_ARCH) if (X86 OR X86_64 OR ARM64 OR PPC) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index f0793bdef..769a73b91 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -458,7 +458,117 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) + else () + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + 
set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + endif () + endif () + if (SMALL_MATRIX_OPT) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" 
"gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + endif () + endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") From aa50185647ba6966dcdb731372af2ecd5ae3b1d4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 02:45:53 +0000 Subject: [PATCH 055/143] Small Matrix: better handle with GEMM3M macro --- interface/gemm.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index f4b9f1537..775f654c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,6 +105,7 @@ static int (*gemm[])(blas_arg_t *,
BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#ifndef GEMM3M #ifdef SMALL_MATRIX_OPT #ifndef DYNAMIC_ARCH #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) @@ -115,18 +116,14 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #ifndef COMPLEX static size_t gemm_small_kernel[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, -#endif }; static size_t gemm_small_kernel_b0[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, -#endif }; #define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) @@ -134,27 +131,24 @@ static size_t gemm_small_kernel_b0[] = { #else static size_t zgemm_small_kernel[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, -#endif }; static size_t zgemm_small_kernel_b0[] = { -#ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, -#endif }; #define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) #define 
ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif +#endif #ifndef CBLAS @@ -468,6 +462,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#ifndef GEMM3M #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ @@ -488,6 +483,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif +#endif #endif buffer = (XFLOAT *)blas_memory_alloc(0); From 76ea8db4da1a651bb4de744162de1ecfc6762e7c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 02:57:58 +0000 Subject: [PATCH 056/143] Small Matrix: enable by default for x86_64 arch If no customized GEMM_SMALL_M_PERMIT kernel defined, it will just by pass to normal path. --- Makefile.system | 3 +++ cmake/system.cmake | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20d8d2f2a..20db80d07 100644 --- a/Makefile.system +++ b/Makefile.system @@ -245,6 +245,9 @@ ONLY_CBLAS = 0 endif #For small matrix optimization +ifeq ($(ARCH), x86_64) +SMALL_MATRIX_OPT = 1 +endif ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif diff --git a/cmake/system.cmake b/cmake/system.cmake index e51dc1fdc..7d2672998 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -258,6 +258,9 @@ if (NEED_PIC) endif() endif () +if (X86_64) + set(SMALL_MATRIX_OPT TRUE) +endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") endif () From 5d86becdaec262e8a2869ce909d94bec881fbfb6 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 5 Aug 2021 11:11:14 +0800 Subject: [PATCH 057/143] Add all SBGEMM kernels for IA AVX512-BF16 based platforms Added all SBGEMM kernels including NN/NT/TN/TT for both 
ColMajor and RowMajor, based on AVX512-BF16 ISA set on IA. Signed-off-by: Chen, Guobing --- kernel/x86_64/bf16_common_macros.h | 52 + .../x86_64/sbgemm_block_microk_cooperlake.c | 2024 ++++++++++++++--- .../sbgemm_microk_cooperlake_template.c | 1737 +++++++++++--- 3 files changed, 3268 insertions(+), 545 deletions(-) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 1014ecc4d..78db7abb2 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -29,6 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#define _MM512_BROADCASTD_EPI32(addr, zmm) \ + __asm__ ("vpbroadcastd (%1), %0;" \ + : "=v" (zmm) \ + : "r" (addr) ) + +#define PREFETCH_T0(addr) \ + __asm__ ("prefetcht0 (%0);" \ + : \ + : "r" (addr) ) + #define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ reg256##_1 = _mm512_castps512_ps256(reg512##_1); @@ -721,6 +731,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm_mask_storeu_ps(targetAddr, mask, regResult); +/* Store 16 (result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm512_add_ps(regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_add_ps(regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm256_add_ps(regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_add_ps(regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm_add_ps(regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm_add_ps(regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + /* Store 16 (alpha * result) to y */ #define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2376fed02..147c5ebdd 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1,4 +1,4 @@ -#include "sbgemm.h" +//#include "sbgemm.h" #include // Walk around those intrinsics that missed by compiler @@ -7,420 +7,1878 @@ #define MM256_STOREU_EPI16(addr, reg) \ 
_mm256_mask_storeu_epi16((addr), ~0, (reg)) -#include -void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) -{ - printf("---- BLOCK %ld x %ld ----\n", m, n); - for (BLASLONG i=0; i> (32-m)); __m512i array512_0, array512_1, array512_2, array512_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0, idx_target_base1; + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0, * dst_addr1; BLASLONG LDA_2x = 2*lda; BLASLONG BF16_BLOCK_T_M_2x = 2*32; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - idx_target_base1 = 32; - for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); - array512_1 = _mm512_loadu_si512(&A[idx_src_base1]); - array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); - array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); - - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += BF16_BLOCK_T_M_2x; - idx_target_base1 += BF16_BLOCK_T_M_2x; - } - - if (tag_k_2x != k) { - __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); - array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); - array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); - } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif -} - -void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) -{ - BLASLONG tag_k_2x = k & (~1); - unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-m)); - __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - - __m512i array512_0, array512_1, array512_2, array512_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG 
idx_target_base0, idx_target_base1; + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + dst_addr1 = block_A + 32; - BLASLONG LDA_2x = 2*lda; - BLASLONG BF16_BLOCK_T_M_2x = 2*32; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - idx_target_base1 = 32; for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1); array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += BF16_BLOCK_T_M_2x; - idx_target_base1 += BF16_BLOCK_T_M_2x; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += BF16_BLOCK_T_M_2x; + dst_addr1 += BF16_BLOCK_T_M_2x; } if (tag_k_2x != k) { __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif } +// INCOPY Kernel, 0> (16-m)); __m256i array256_0, array256_1, array256_2, array256_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + bfloat16 * 
src_addr0, * src_addr1; + bfloat16 * dst_addr0; BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); - array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, src_addr1); array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += 32; } if (tag_k_2x != k) { __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); } +} -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif +// K=32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_32x16(bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_4x = lda*4; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = 
block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0); + array512_way0_1 = _mm512_loadu_si512(src_addr1); + array512_way0_2 = _mm512_loadu_si512(src_addr2); + array512_way0_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0); + array512_way1_1 = _mm512_loadu_si512(src_addr1); + array512_way1_2 = _mm512_loadu_si512(src_addr2); + array512_way1_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = 
_mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0); + array512_way2_1 = _mm512_loadu_si512(src_addr1); + array512_way2_2 = _mm512_loadu_si512(src_addr2); + array512_way2_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0); + array512_way3_1 = _mm512_loadu_si512(src_addr1); + array512_way3_2 = _mm512_loadu_si512(src_addr2); + array512_way3_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = 
_mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 =
_mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols +
array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } -void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +// K=Any number but will be processed based on 32, M=32 +void COL_MAJOR_ITCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - BLASLONG tag_k_2x = k & (~1); - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; - __m256i array256_0, array256_1, array256_2, array256_3; + BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + BLASLONG LDA_16x = lda*16; - BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); - array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); - array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*16; - 
idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + 
array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = 
_mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = 
_mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 14/15 and 30/31 cols + 
array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_16x; + src_addr1 += LDA_16x; + src_addr2 += LDA_16x; + src_addr3 += LDA_16x; + dst_addr0 -= (64*7 - 32); + dst_addr1 -= (64*7 - 32); + } + src_addr0 -= (LDA_16x*2); + src_addr1 -= (LDA_16x*2); + src_addr2 -= (LDA_16x*2); + src_addr3 -= (LDA_16x*2); + dst_addr0 += (32*30); + dst_addr1 += (32*30); } - if (tag_k_2x != k) { - __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); - array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], 
array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = 
_mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, 
and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = 
array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = 
_mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + } + } +} + +// K=Any number but will be processed based on 32, 16> 1; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + // Load and preprocess 4 rows + array512[array_idx+0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[array_idx+1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[array_idx+2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[array_idx+3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += 
LDA_4x; + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); 
+ _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + + for (int j = 0; j < m_rem; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m_rem; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], 
_mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + } +} + +// K=Any number but will be processed based on 32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i 
M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = 
_mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = 
_mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + 
array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, 
array512_3); + dst_addr0 += 32*9; + dst_addr1 += 32*9; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += 
LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 
8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] 
= array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + 
_mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// K=Any number but will be processed based on 32, M<=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + src_addr0 = A; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512[16]; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for 
(int j = 0; j < m; j++) { + array512[j] = _mm512_loadu_si512(src_addr0+j*lda+idx_k); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + // Compose and store the 0/1, 2/3, 4/5, 6/7 and 16/17, 18/19, 20/21, 22/23 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + // Compose and store the 8/9, 10/11, 12/13, 14/15 and 24/25, 26/27, 28/29, 30/31 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + 
_mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + dst_addr0 += 32*8; + dst_addr1 += 32*8; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + + for (int j = 0; j < m; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = 
array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } } +// COL_MAJOR_ONCOPY_KERNEL_16x32 behaves exactly the same as COL_MAJOR_ITCOPY_KERNEL_Kx16 +#define COL_MAJOR_ONCOPY_KERNEL_16x32 COL_MAJOR_ITCOPY_KERNEL_Kx16 + void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; - BLASLONG idx_target_base0; - idx_src_base0 = 0; - idx_src_base1 = 1*ldb; - idx_src_base2 = 2*ldb; - idx_src_base3 = 3*ldb; - idx_src_base4 = 4*ldb; - idx_src_base5 = 5*ldb; - idx_src_base6 = 6*ldb; - idx_src_base7 = 7*ldb; - 
idx_target_base0 = 0; + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3, * src_addr4, * src_addr5, * src_addr6, * src_addr7; + bfloat16 * dst_addr0; + + unsigned char blend_mask = (((unsigned char)0xcc)); + __m512i permute_idx = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + src_addr4 = src_addr0 + 4*ldb; + src_addr5 = src_addr0 + 5*ldb; + src_addr6 = src_addr0 + 6*ldb; + src_addr7 = src_addr0 + 7*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); - idx_target_base0 += 32*8; + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = 
_mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_loadu_si512(src_addr4+idx_k); + array512_1 = _mm512_loadu_si512(src_addr5+idx_k); + array512_2 = _mm512_loadu_si512(src_addr6+idx_k); + array512_3 = _mm512_loadu_si512(src_addr7+idx_k); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + 
array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + + dst_addr0 += 256; } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); - 
_mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr4+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr5+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr6+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr7+tag_k_32x); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = 
_mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_4x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0; + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, 
array512_way0_2, array512_way0_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + dst_addr0 += 128; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, 
src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + } } void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); BLASLONG tag_n_2x = n & (~1); - BLASLONG idx_src_base0; - BLASLONG idx_target_base0; + + bfloat16 * src_addr0; + bfloat16 * dst_addr0; BLASLONG LDB_2x = 2*ldb; - idx_target_base0 = 0; + src_addr0 = B; + dst_addr0 = block_B; for (BLASLONG idx_k = 0; idx_k < 
tag_k_32x; idx_k += 32) { - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_loadu_si512(src_addr0 + ldb + idx_k)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - idx_target_base0 += 32; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + dst_addr0 += 32; } } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + ldb + tag_k_32x)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); } } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = 
k & (~1); + unsigned char tail_mask_value = (unsigned char) 0xff; + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask = (((unsigned char)0xff) >> (8-n)); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG 
idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } } -// Scale matrix C while beta is not ZERO or ONE +// Scale matrix C when beta is not ZERO or ONE void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 array_512_0, array_512_1, array_512_2, array_512_3; + __m512 BETAVECTOR = _mm512_set1_ps(beta); - __m512 BETAVECTOR = _mm512_set1_ps(beta); + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for 
(BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); - array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); - array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_1 = _mm512_loadu_ps(C_addr1 + idx_m); + array_512_2 = _mm512_loadu_ps(C_addr2 + idx_m); + array_512_3 = _mm512_loadu_ps(C_addr3 + idx_m); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + _mm512_storeu_ps(C_addr1 + idx_m, array_512_1); + _mm512_storeu_ps(C_addr2 + idx_m, array_512_2); + _mm512_storeu_ps(C_addr3 + idx_m, array_512_3); + } - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); - _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); - _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); - _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); - } + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, C_addr1 + tag_n_Mx); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, C_addr2 + tag_n_Mx); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, C_addr3 + tag_n_Mx); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = 
_mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, array_512_1); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, array_512_2); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, array_512_3); + } - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); - array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); - array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); - _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; - } - - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_storeu_ps(&C[idx_base_0+idx_m], 
array_512_0); - } - - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - } - idx_base_0 += ldc; + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); } + C_addr0 += ldc; } - } else { - } } -// Scale matrix C while beta is not ZERO or ONE +// Zero C matrix when Beta is 0 void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); - } + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); - 
_mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); - } + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr1 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr2 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr3 + idx_m, ZEROVECTOR); + } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, ZEROVECTOR); } - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - } + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - } - idx_base_0 += ldc; + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); } - } - } else { + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + C_addr0 += ldc; + } } -} \ No newline at end of file +} diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index dd4cb440b..c71595813 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -2,45 +2,115 @@ #include "bf16_common_macros.h" 
#include +/* These macros are needed and should be placed at the right place +#define BF16_BLOCK_STEP_N 8 +#define BF16_BLOCK_THRES_K 1024 +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define A(i,j) A[(i)*lda+(j)] +#define B(i,j) B[(i)*ldb+(j)] +#define C(i,j) C[(i)*ldc+(j)] + +#define ONE 1.e0f +#define ZERO 0.e0f +*/ + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT -#undef SBGEMM_BLOCK_KERNEL_32x8x32 -#undef SBGEMM_BLOCK_KERNEL_16x8x32 -#undef SBGEMM_BLOCK_KERNEL_32xNx32 -#undef SBGEMM_BLOCK_KERNEL_16xNx32 -#undef SBGEMM_BLOCKING_KERNEL_2 +#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_NN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_NT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_NT_16xNxK +#undef SBGEMM_BLOCK_KERNEL_TN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_TN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_TT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_TT_16xNxK +#undef SBGEMM_BLOCKING_KERNEL_NN +#undef SBGEMM_BLOCKING_KERNEL_NT +#undef SBGEMM_BLOCKING_KERNEL_TN +#undef SBGEMM_BLOCKING_KERNEL_TT #ifndef ONE_ALPHA // ALPHA is not ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha - #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT 
STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_alpha + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_alpha + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_alpha + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_alpha + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_alpha + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_alpha #else // ALPHA is ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one - #define SBGEMM_BLOCKING_KERNEL_2 
sbgemm_blocking_kernel_2_one + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_one + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_one + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_one + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_one + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_one + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_one #endif +extern bfloat16 * block_A; +extern bfloat16 * block_B; +/* --------------------------------------------- NN kernels ------------------------------------------ */ // SBGEMM Kernel for 16> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); result_512_0 = _mm512_shuffle_f32x4(result_512_0, 
result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) } else { result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) 
- STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) - STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) - STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) - STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) - STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) - STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) } } // SBGEMM Kernel for 16> (32-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (32-m)); for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); - STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); 
- STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) } } } // SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base #ifndef ONE_ALPHA // ALPHA is not ONE -void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #else // ALPHA is ONE -void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #endif { + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); - BLASLONG idxB_base = 0; - BLASLONG width = 32; #ifndef ONE_ALPHA __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); @@ -432,21 +484,49 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a result_512[i+1] = _mm512_setzero_ps(); } - for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { // Load B with unroll n - for (int i = 0; i < n; i ++) { - arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); - idxB_base += 32; + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) 
_mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } } + } - if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + BLASLONG width = k - tag_k_32x; for (BLASLONG idx = 0; idx < width;) { // Each two rows are a group for 32-pair bf16 elements // Load two rows into a 512 register - arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; - for (int i = 0; i < n; i ++) { + for (int i = 0; i < n; i++) { result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); } @@ -462,23 +542,24 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a } if (m != 16) { - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) } } } + + #ifndef ONE_ALPHA // ALPHA is not ONE -void 
sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #else // ALPHA is ONE -void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #endif { BLASLONG m_step, n_step, k_step, k_step_round32; @@ -499,63 +580,52 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); } for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); } } if (tag_m_Nx != M) { m_step = M - tag_m_Nx; if (m_step > 16) { - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); - for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + 
(idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); - } - - if (tag_n_Nx != n_to) { - n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); - } - } else if (m_step == 16) { - COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); } } else { - COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), 
ldc); } } } @@ -573,22 +643,274 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } } else { - m_step = M - tag_m_Nx; + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NN kernels --------------------------------------- */ + +/* --------------------------------------------- NT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef 
ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, 
bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
- COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + 
n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } } idx_k += k_step; @@ -597,13 +919,884 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, k_step_round32 = k_step & (~31); k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; } + n_from = n_to; n_to += BF16_BLOCK_THRES_N; n_to = (n_to > N) ? N : n_to; tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? 
BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } } } +/* ----------------------------------------- End of NT kernels --------------------------------------- */ + +/* --------------------------------------------- TN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_8, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_MASK_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_8, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + 
STORE16_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M=16, N=8, K=Any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Load 16 pair of BF16 elements from A (16 rows) + arrayA_512_0 = _mm512_loadu_si512(A_addr + 0); + + // Load 8 rows of B + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); 
+ _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 32 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else 
{ + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K=Any number but will be processed based on 32 +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + + int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i++) { + result_512[i] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for 
(int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, 
blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); // TODO how to process m + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + 
(idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TN kernels --------------------------------------- */ + +/* --------------------------------------------- TT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = 
_mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), 
ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TT kernels --------------------------------------- */ #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, @@ -613,13 +1806,33 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) #endif { - bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; - bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; - - // TODO: assume no trans for both A and B, to complement these scenarios later if (Order == CblasColMajor) { - SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_NT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_TN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } } else { - + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NT(N, M, K, alpha, B, ldb, A, 
lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } } -} \ No newline at end of file +} From 44d0032f3b8e9794d51b7807b3fb53905a2e9f1c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 04:43:47 +0000 Subject: [PATCH 058/143] Small Matrix: skylakex: fix build error in old compiler --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_tt_skylakex.c | 10 +++++----- kernel/x86_64/sgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_tt_skylakex.c | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index ff2a04beb..d9b380fff 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index 0a95a68e2..e757197ba 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { 
index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 0881f35b2..18c797283 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m4; i += 4) { for (j = 0; j < n4; j += 4) { diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c index 8ff79d2c8..00f42aa76 100644 --- a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m8; i += 8) { for (j = 0; j < n16; j += 16) { @@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - idx_lo = _mm512_loadu_epi64(permute_table2); - idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + idx_lo = 
_mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); #endif diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index f293bf9f9..a7d87f8c4 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n64; j += 64) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c index 8da560ef7..023f58746 100644 --- a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); 
__mmask16 kc = 0xcccc; __mmask16 k3 = 0x3333; __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE @@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); #endif From c17d6dacb23f0862f6f0318c55c097c361132663 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 05:46:13 +0000 Subject: [PATCH 059/143] Small Matrix: skip compile in unimplemented data type --- interface/gemm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 775f654c3..3497d8651 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,8 +105,13 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT #ifndef DYNAMIC_ARCH #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) #else @@ -148,7 +153,6 @@ static size_t zgemm_small_kernel_b0[] = { #define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif -#endif #ifndef CBLAS @@ -462,8 +466,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if USE_SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ @@ -483,7 +486,6 @@ 
void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif -#endif #endif buffer = (XFLOAT *)blas_memory_alloc(0); From e5ba7c3235cd5ac9613e0989621c8d22294def5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Aug 2021 11:08:18 +0200 Subject: [PATCH 060/143] Disable all x86 jobs --- .travis.yml | 302 ++++++++++++++++++++++++++-------------------------- 1 file changed, 151 insertions(+), 151 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2a221e3bd..8657b64f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,38 +55,38 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 USE_OPENMP=1" +# +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 CC=clang" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" +# +# - <<: *test-ubuntu +# addons: +# apt: +# packages: +# - gcc-multilib +# - gfortran-multilib +# env: +# - TARGET_BOX=LINUX32 +# - BTYPE="BINARY=32" +# - os: linux arch: ppc64le dist: bionic @@ -121,47 +121,47 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - 
gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - +# - os: linux +# compiler: gcc +# addons: +# apt: +# packages: +# - binutils-mingw-w64-x86-64 +# - gcc-mingw-w64-x86-64 +# - gfortran-mingw-w64-x86-64 +# before_script: *common-before +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=WIN64 +# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" +# # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. # These jobs needs sudo, so Travis runs them on VM-based infrastructure # which is slower than container-based infrastructure used for jobs # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. 
- - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" + # - &test-alpine + # os: linux + # dist: trusty + # sudo: true + # language: minimal + # before_install: + # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + # install: + # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + # before_script: *common-before + # script: + # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + # - alpine make -C test $COMMON_FLAGS $BTYPE + # - alpine make -C ctest $COMMON_FLAGS $BTYPE + # - alpine make -C utest $COMMON_FLAGS $BTYPE + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64" # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, # but only on Travis CI, cannot reproduce it elsewhere. @@ -171,98 +171,98 @@ matrix: # - TARGET_BOX=LINUX64_MUSL # - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# # Build with the same flags as Alpine do in OpenBLAS package. +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - # Build with the same flags as Alpine do in OpenBLAS package. 
- - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" +# - &test-cmake +# os: linux +# compiler: clang +# addons: +# apt: +# packages: +# - gfortran +# - cmake +# dist: trusty +# sudo: true +# before_script: +# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" +# script: +# - mkdir build +# - CONFIG=Release +# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG +# - cmake --build build --config $CONFIG -- -j2 +# env: +# - CMAKE=1 +# - <<: *test-cmake +# env: +# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" +# - <<: *test-cmake +# compiler: gcc +# env: +# - CMAKE=1 - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" +# - &test-macos +# os: osx 
+# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 # env: # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - env: +# - <<: *test-macos +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# env: # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - env: -# - 
CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" +# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" +# - <<: *test-macos +# osx_image: xcode11.5 +# env: +## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" +# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 os: linux From b06880c2cdfc8a0bd5caa2c1d62f7bba3611b932 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 10 Aug 2021 22:06:04 -0500 
Subject: [PATCH 061/143] POWER10: Improving dasum performance Unrolling a loop in dasum micro code to help in improving POWER10 performance. --- kernel/power/dasum.c | 4 +- kernel/power/dasum_microk_power10.c | 120 ++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 7507621cf..35390dd24 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -115,14 +115,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) - if ( n >= 16 ) + if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { sumf += ABS(x[i]); } } - n1 = (n-i) & -16; + n1 = (n-i) & -32; if ( n1 > 0 ) { sumf += dasum_kernel_16(n1, &x[i]); diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c index d1a21b4d1..110627fa4 100644 --- a/kernel/power/dasum_microk_power10.c +++ b/kernel/power/dasum_microk_power10.c @@ -34,6 +34,19 @@ static double dasum_kernel_16 (long n, double *x) __vector double t1; __vector double t2; __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + __vector double a0; + __vector double a1; + __vector double a2; + __vector double a3; + __vector double a4; + __vector double a5; + __vector double a6; + __vector double a7; + __asm__ ( @@ -48,14 +61,27 @@ static double dasum_kernel_16 (long n, double *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" + "xxlxor %x11, %x11, %x11 \n\t" + "xxlxor %x12, %x12, %x12 \n\t" + "xxlxor %x13, %x13, %x13 \n\t" + "xxlxor %x14, %x14, %x14 \n\t" + "xxlxor %x15, %x15, %x15 \n\t" + "xxlxor %x16, %x16, %x16 \n\t" + "xxlxor %x17, %x17, %x17 \n\t" + "xxlxor %x18, %x18, %x18 \n\t" + "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" "lxvp 44, 64(%2) \n\t" "lxvp 46, 96(%2) \n\t" + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + 
"lxvp 58, 224(%2) \n\t" - "addi %2, %2, 128 \n\t" + "addi %2, %2, 256 \n\t" - "addic. %1, %1, -16 \n\t" + "addic. %1, %1, -32 \n\t" "ble two%= \n\t" ".align 5 \n" @@ -65,33 +91,52 @@ static double dasum_kernel_16 (long n, double *x) "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" - "lxvp 40, 0(%2) \n\t" - "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" - "lxvp 42, 32(%2) \n\t" - - "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" - "lxvp 44, 64(%2) \n\t" - "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" - - "lxvp 46, 96(%2) \n\t" - "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" - "addi %2, %2, 128 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" - "addic. %1, %1, -16 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "addi %2, %2, 256 \n\t" + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" "two%=: \n\t" @@ -114,6 +159,25 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" @@ -122,7 +186,18 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" + "xvadddp %x11, %x11, %x12 \n\t" + "xvadddp %x13, %x13, %x14 \n\t" + "xvadddp %x15, %x15, %x16 \n\t" + "xvadddp %x17, %x17, %x18 \n\t" + + "xvadddp %x11, %x11, %x13 \n\t" + "xvadddp %x15, %x15, %x17 \n\t" + + "xvadddp %x11, %x11, %x15 \n\t" + "xvadddp 32, 32, 36 \n\t" + "xvadddp 32, 32, %x11 \n\t" XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" @@ -136,14 +211,27 @@ static double dasum_kernel_16 (long n, double *x) "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 - "=wa" (t3) // 6 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5), // 8 + "=wa" (t6), // 9 + "=wa" (t7), // 10 + "=wa" (a0), // 11 + "=wa" (a1), // 12 + "=wa" (a2), // 13 + "=wa" (a3), // 14 + "=wa" (a4), // 15 + "=wa" (a5), // 16 + "=wa" (a6), // 17 + "=wa" (a7) // 18 : "m" (*x) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" ); return sum; From c28560129f65c212eba0093e99f4c9163856bffa Mon Sep 17 00:00:00 2001 From: cianciosa Date: Wed, 11 Aug 2021 12:00:07 -0400 
Subject: [PATCH 062/143] Check the total number of arguments passed insead of if the ARGV# is defined. This fixes a problem when compling openblas as a subproject of another code. --- cmake/utils.cmake | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6b54092ea..09bae7011 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -157,31 +157,31 @@ endfunction () # STRING - compiles only the given type (e.g. DOUBLE) function(GenerateNamedObjects sources_in) - if (DEFINED ARGV1) + if (${ARGC} GREATER 1) set(defines_in ${ARGV1}) endif () - if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") + if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) # strip off extension for kernel files that pass in the object name. get_filename_component(name_in ${name_in} NAME_WE) endif () - if (DEFINED ARGV3) + if (${ARGC} GREATER 3) set(use_cblas ${ARGV3}) else () set(use_cblas false) endif () - if (DEFINED ARGV4) + if (${ARGC} GREATER 4) set(replace_last_with ${ARGV4}) endif () - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(append_with ${ARGV5}) endif () - if (DEFINED ARGV6) + if ${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) @@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in) set(real_only false) set(complex_only false) set(mangle_complex_sources false) - if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") + if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) @@ -342,17 +342,17 @@ endfunction () function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) set(alternate_name_in "") - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(alternate_name_in ${ARGV5}) endif () set(no_float_type false) - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) endif () set(complex_filename_scheme "") - if (DEFINED ARGV7) + 
if (${ARGC} GREATER 7) set(complex_filename_scheme ${ARGV7}) endif () From 4c766cd11fa3f27ed1b572225ab2e937e43a2bab Mon Sep 17 00:00:00 2001 From: cianciosa Date: Wed, 11 Aug 2021 12:08:34 -0400 Subject: [PATCH 063/143] Fix a small syntax error. A ( was accidently deleted. --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 09bae7011..01b489f2a 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -181,7 +181,7 @@ function(GenerateNamedObjects sources_in) set(append_with ${ARGV5}) endif () - if ${ARGC} GREATER 6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) From a7bc8ec1f107a95a18cfcdbd5c47721abfa75cb9 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 10 Aug 2021 16:42:57 +0800 Subject: [PATCH 064/143] Delete the macro instruction "li" and use "li.d" instead Change-Id: Icff7981e2eb7df29ba5af1f8eb5be8443c67450f --- kernel/loongarch64/asum.S | 2 +- kernel/loongarch64/cnrm2.S | 2 +- kernel/loongarch64/copy.S | 2 +- kernel/loongarch64/dot.S | 2 +- kernel/loongarch64/gemv_n.S | 4 ++-- kernel/loongarch64/gemv_t.S | 2 +- kernel/loongarch64/iamax.S | 12 ++++++------ kernel/loongarch64/iamin.S | 12 ++++++------ kernel/loongarch64/izamax.S | 12 ++++++------ kernel/loongarch64/izamin.S | 12 ++++++------ kernel/loongarch64/scal.S | 2 +- kernel/loongarch64/snrm2.S | 2 +- kernel/loongarch64/swap.S | 2 +- kernel/loongarch64/zcopy.S | 2 +- kernel/loongarch64/zdot.S | 2 +- kernel/loongarch64/zgemv_n.S | 4 ++-- kernel/loongarch64/zgemv_t.S | 2 +- kernel/loongarch64/zscal.S | 2 +- 18 files changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S index e4c717085..7d21ce038 100644 --- a/kernel/loongarch64/asum.S +++ b/kernel/loongarch64/asum.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE bge $r0, N, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index c4b2555d3..9d27987e1 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S index 28b7bce4c..3156f60b8 100644 --- a/kernel/loongarch64/copy.S +++ b/kernel/loongarch64/copy.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCY, 0(INCY) #endif - li TEMP, SIZE + li.d TEMP, SIZE NOP slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S index 4fcd569c8..1e4c81a02 100644 --- a/kernel/loongarch64/dot.S +++ b/kernel/loongarch64/dot.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 srai.d I, N, 3 diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S index 334a2991f..9ab43ae19 100644 --- a/kernel/loongarch64/gemv_n.S +++ b/kernel/loongarch64/gemv_n.S @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -472,7 +472,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L900: - li YORIG, SIZE + li.d YORIG, SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S index 19333ed4a..af4232769 100644 --- a/kernel/loongarch64/gemv_t.S +++ b/kernel/loongarch64/gemv_t.S @@ -88,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S index 0f9e1bc59..31b1a9e57 100644 --- a/kernel/loongarch64/iamax.S +++ b/kernel/loongarch64/iamax.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S index 7751a9d03..9364b9725 100644 --- a/kernel/loongarch64/iamin.S +++ b/kernel/loongarch64/iamin.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S index 6d7cb9e30..8d3ae529e 100644 --- a/kernel/loongarch64/izamax.S +++ b/kernel/loongarch64/izamax.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S index 998927985..38a109c21 100644 --- a/kernel/loongarch64/izamin.S +++ b/kernel/loongarch64/izamin.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S index 7399e57b3..566bce6cb 100644 --- a/kernel/loongarch64/scal.S +++ b/kernel/loongarch64/scal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE MTC a1, $r0 slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 14b62cfe7..57c21a017 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, SIZE + li.d TEMP, SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S index c9d8f7fc1..4578a8d54 100644 --- a/kernel/loongarch64/swap.S +++ b/kernel/loongarch64/swap.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 slli.d INCY, INCY, BASE_SHIFT diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S index 3fbe56074..0f480ca85 100644 --- a/kernel/loongarch64/zcopy.S +++ b/kernel/loongarch64/zcopy.S @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT INCY, 0(INCY) #endif - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE NOP slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S index 087c3845f..81ac19fbd 100644 --- a/kernel/loongarch64/zdot.S +++ b/kernel/loongarch64/zdot.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MOV s3, s2 MOV s4, s3 slli.d INCX, INCX, ZBASE_SHIFT - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 srai.d I, N, 2 diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S index 0cc49c789..d995ce86b 100644 --- a/kernel/loongarch64/zgemv_n.S +++ b/kernel/loongarch64/zgemv_n.S @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -576,7 +576,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L900: - li YORIG, 2 * SIZE + li.d YORIG, 2 * SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S index 85a9a0c0d..841823e1c 100644 --- a/kernel/loongarch64/zgemv_t.S +++ b/kernel/loongarch64/zgemv_t.S @@ -116,7 +116,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S index fe53ed713..a12e527a5 100644 --- a/kernel/loongarch64/zscal.S +++ b/kernel/loongarch64/zscal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE MTC a1, $r0 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 From 989e6bbdd39fe3d49789b803c4fd6b20a3a673e5 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 03:17:38 +0000 Subject: [PATCH 065/143] Small Matrix: reduce generic kernel source files --- kernel/CMakeLists.txt | 56 ++++----- kernel/Makefile.L3 | 112 +++++++++--------- .../generic/gemm_small_matrix_kernel_b0_nn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_nt.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tt.c | 49 -------- kernel/generic/gemm_small_matrix_kernel_nn.c | 11 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 9 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 8 ++ kernel/generic/gemm_small_matrix_kernel_tt.c | 8 ++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 ------------ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 ------------ kernel/generic/zgemm_small_matrix_kernel_nn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 11 ++ 18 files changed, 161 insertions(+), 588 deletions(-) delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 769a73b91..d8a230436 
100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -495,30 +495,30 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) else () - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) else () - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) else () - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) else () - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT 
../generic/gemm_small_matrix_kernel_tt.c) endif () endif () @@ -541,32 +541,32 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" 
"gemm_small_kernel_b0_cn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false 
${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" 
"gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f977793a0..ef11e391c 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4334,32 +4334,32 @@ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifndef DGEMM_SMALL_K_B0_NN -DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef DGEMM_SMALL_K_B0_NT -DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef DGEMM_SMALL_K_B0_TN -DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef DGEMM_SMALL_K_B0_TT -DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ 
$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef SGEMM_SMALL_M_PERMIT SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c @@ -4397,32 +4397,32 @@ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifndef SGEMM_SMALL_K_B0_NN -SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SGEMM_SMALL_K_B0_NT -SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SGEMM_SMALL_K_B0_TN -SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SGEMM_SMALL_K_B0_TT -SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o 
$@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4496,68 +4496,68 @@ $(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef CGEMM_SMALL_K_B0_NN -CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef CGEMM_SMALL_K_B0_NT -CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef CGEMM_SMALL_K_B0_TN -CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef CGEMM_SMALL_K_B0_TT -CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ 
$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ ifndef ZGEMM_SMALL_M_PERMIT ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4632,65 +4632,65 @@ $(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef ZGEMM_SMALL_K_B0_NN -ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef ZGEMM_SMALL_K_B0_NT -ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef ZGEMM_SMALL_K_B0_TN -ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef ZGEMM_SMALL_K_B0_TT -ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ 
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c deleted file mode 100644 index 3be918017..000000000 --- a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c +++ /dev/null @@ -1,49 +0,0 @@ -/*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) -{ - //naive implemtation - //Column major - - BLASLONG i,j,k; - FLOAT result=0.0; - - for(i=0; i Date: Fri, 13 Aug 2021 03:28:44 +0000 Subject: [PATCH 066/143] Small Matrix: skylakex: remove unnecessary b0 source files --- kernel/x86_64/KERNEL.SKYLAKEX | 16 ++++++++-------- .../x86_64/dgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tt_skylakex.c | 3 --- 9 files changed, 8 insertions(+), 25 deletions(-) delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 
kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index eb0cbaf98..6b4961bc2 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,13 +12,13 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c -SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c -SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c -SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c -SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c @@ -29,13 +29,13 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c -DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c -DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c -DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c -DGEMM_SMALL_K_B0_TT = 
dgemm_small_kernel_b0_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index a58738a25..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index eafe2ce49..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 1dfa0aaf1..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 93fab1836..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index 704e964b8..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index 6d7934be1..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 
-#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 0f9745b72..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 27d9e0afd..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,3 +0,0 @@ -#define B0 1 -#define TT 1 -#include "./sgemm_small_kernel_tt_skylakex.c" From 13d411677f4b0a617142b3fd4c15d7be4c442477 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Aug 2021 00:17:23 +0200 Subject: [PATCH 067/143] Add more OSX build jobs to Azure CI (#3338) * Add OSX build job with Homebrew OpenMP in a CMAKE build * Check install step on OSX/gcc to make sure all include files are generated and installed as intended * Add mixed clang/gfortran build with cmake on OSX * move IOS ARMV7/ARMV8 crossbuilds from travis to azure --- azure-pipelines.yml | 56 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 889b920e3..b1bded639 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -83,6 +83,8 @@ jobs: - script: | brew update make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + ls -lR ../blasinst - job: OSX_GCC_Nothreads pool: @@ -104,6 +106,38 @@ jobs: brew install llvm libomp make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 +- job: OSX_OpenMP_Clang_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib 
+ LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + make + ctest + +- job: OSX_OpenMP_Clang_gf_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. + make + ctest + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' @@ -146,7 +180,27 @@ jobs: brew install --cask android-ndk export ANDROID_NDK_HOME=/usr/local/share/android-ndk make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 - + +- job: OSX_IOS_ARMV8 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + steps: + - script: | + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: OSX_IOS_ARMV7 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + steps: + - script: | + make 
TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' From cdb5d2737e92d17c600903bf97ac32d1659ce324 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:22:51 +0100 Subject: [PATCH 068/143] add support for building on windows/arm64 target --- common_arm64.h | 2 +- ctest.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 2270ffba7..029e23886 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .text ; .p2align 2 ; .global REALNAME ; -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(_WIN32) .type REALNAME, %function ; #endif REALNAME: diff --git a/ctest.c b/ctest.c index 4f18918f5..2afd93f68 100644 --- a/ctest.c +++ b/ctest.c @@ -84,7 +84,7 @@ OS_AIX OS_OSF #endif -#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) OS_WINNT #endif @@ -141,7 +141,7 @@ ARCH_SPARC ARCH_IA64 #endif -#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) BINARY_64 #endif From c6c2a71fb7c4ea36558c911f964557b7ac3a35c8 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:25:07 +0100 Subject: [PATCH 069/143] Fix ctest.h to build using clang on windows --- utest/ctest.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utest/ctest.h b/utest/ctest.h index 037f7f28d..79961badf 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -65,9 +65,14 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) +#if defined(__clang__) +#define __CTEST_NO_TIME +#undef 
CTEST_SEGFAULT +#elif defined(_MSC_VER) #define __CTEST_MSVC #endif +#endif //config for MSVC compiler #ifdef __CTEST_MSVC @@ -286,7 +291,7 @@ void assert_dbl_far(double exp, double real, double tol, const char* caller, int #endif #include -#ifdef __CTEST_MSVC +#ifdef _WIN32 #include #else #include From e9acb464318618009d13ddcc7e30dc300e878052 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 25 Aug 2021 07:07:27 +0000 Subject: [PATCH 070/143] sgemv: skylakex: bug fix for sgemv_t kernel in corner case --- kernel/x86_64/sgemv_t_4.c | 2 +- .../x86_64/sgemv_t_microk_skylakex_template.c | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76236cd16..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" -/*#include "sgemv_t_microk_skylakex.c"*/ +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c index 34415054c..423413465 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex_template.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -93,7 +93,7 @@ static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { - for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_16x; idx_m+=32) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); @@ -145,8 +145,8 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { for (BLASLONG 
idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { - m0 = _mm512_loadu_ps(&a[idx_m]); - m1 = _mm512_loadu_ps(&a[idx_m + 16]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); @@ -157,7 +157,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * __mmask8 load_mask = *((__mmask8*) &load_mask_value); x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { - m0 = _mm512_loadu_ps(&a[idx_m]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -171,7 +171,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); - m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x]); + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -346,7 +346,7 @@ static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * c3 = _mm256_extractf32x4_ps(c256_2, 0); c4 = _mm256_extractf32x4_ps(c256_2, 1); - ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, y)); + ret = 
_mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); } @@ -958,6 +958,7 @@ static int sgemv_kernel_t_7(BLASLONG m, float alpha, float *a, float *x, float * c256_1 = _mm512_extractf32x8_ps(tmp0, 1); c256_0 = _mm256_add_ps(c256_0, c256_1); + c256_0 = _mm256_mul_ps(c256_0, alpha256); __m128 c128_0 = _mm256_extractf32x4_ps(c256_0, 0); __m128 c128_1 = _mm256_extractf32x4_ps(c256_0, 1); @@ -1016,9 +1017,10 @@ static int sgemv_kernel_t_8(BLASLONG m, float alpha, float *a, float *x, float * __m512 m0, m1, m2, m3; __m256 r0, r1, r2, r3, r4, r5, r6, r7, tmp0, tmp1, tmp2, tmp3; __m128 c128_0, c128_1, c128_2, c128_3; - __m128 alpha128 = _mm_set1_ps(alpha); + __m256 alpha256 = _mm256_set1_ps(alpha); __m256 x256 = _mm256_loadu_ps(x); + x256 = _mm256_mul_ps(x256, alpha256); __m512 x512 = _mm512_broadcast_f32x8(x256); for(BLASLONG idx_m=0; idx_m Date: Wed, 25 Aug 2021 07:13:00 +0000 Subject: [PATCH 071/143] sgemv: skylakex: fix build warning --- kernel/x86_64/sgemv_n_4.c | 3 --- kernel/x86_64/sgemv_t_microk_skylakex_template.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 06de28d97..90865c4b3 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -302,9 +302,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT * xbuffer_align = x; FLOAT * ybuffer_align = y; - FLOAT * xbuffer = NULL; - FLOAT * ybuffer = NULL; - if (inc_x != 1) { xbuffer_align = buffer; for(BLASLONG i=0; i> (16-((m-tag_m_8x)*2)&15)); + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(((m-tag_m_8x)*2)&15))); __mmask16 a_mask = *((__mmask16*) &tail_mask_value); unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); @@ -322,7 +322,7 @@ static int 
sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * { BLASLONG tag_m_4x = m & (~3); BLASLONG tag_m_2x = m & (~1); - __m512 m0, m1, m2; + __m512 m0, m1; __m256 m256_0, m256_1, c256_1, c256_2; __m128 c1, c2, c3, c4, ret; __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); From 7d1becc575d436039f1484259a10413aade9cda9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 14:18:36 +0200 Subject: [PATCH 072/143] Allocate an auxiliary struct when running out of preconfigured threads --- driver/others/memory.c | 145 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 460a3d557..377e073ee 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2636,8 +2636,25 @@ static volatile struct { } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +static volatile struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2779,6 +2796,29 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + if (memory_overflowed) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +} goto error; allocation : @@ -2883,6 +2923,90 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: + if (memory_overflowed) goto terminate; + printf("num_buffers exceeded, adding auxiliary array\n"); + memory_overflowed=1; + newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (int i=0;i<512;i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + newmemory[position-NUM_BUFFERS].used = 1; + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS 
Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +//#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: printf("OpenBLAS : Program is Terminated. 
Because you tried to allocate too many memory regions.\n"); printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); @@ -2907,13 +3031,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (memory_overflowed) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +//#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2927,7 +3066,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); From b4b952eece8344fe5d7adf2352791ab81d0d1d8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:03:53 +0200 Subject: [PATCH 073/143] Add auxiliary tracking space for thread buffer frees too --- driver/others/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 377e073ee..d4fdfa465 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2060,6 +2060,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2110,8 +2111,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2274,8 +2280,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + { else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2307,8 +2318,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void 
*)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2341,8 +2357,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2370,8 +2391,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2414,9 +2440,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2450,9 +2482,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; 
release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2556,8 +2594,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2604,9 +2647,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2663,6 +2712,8 @@ static int memory_overflowed = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2926,8 +2977,9 @@ void *blas_memory_alloc(int procpos){ if (memory_overflowed) goto terminate; printf("num_buffers exceeded, adding auxiliary array\n"); memory_overflowed=1; - newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (int i=0;i<512;i++) { + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ 
-3101,7 +3153,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (pos < NUM_BUFFERS) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -3118,6 +3173,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); From 2ba9a567aaaac875be19a76009853b2ee4597dbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:14:59 +0200 Subject: [PATCH 074/143] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d4fdfa465..3825e83ae 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2283,7 +2283,7 @@ static void *alloc_mmap(void *address){ if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; - { else { + } else { new_release_info[release_pos-NUM_BUFFERS].address = map_address; new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; } From 7fd12a5e69164b62dad7fbddf1581d941e5339fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 13:54:51 +0200 Subject: [PATCH 075/143] Add likely() hints for gcc --- driver/others/memory.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 3825e83ae..689aba942 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -2111,7 +2119,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2280,7 +2288,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2318,7 +2326,7 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; } else { @@ -2357,7 +2365,7 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; } else { @@ -2391,7 +2399,7 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; } else { @@ -2440,7 +2448,7 @@ static void *alloc_devicedirver(void 
*address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; @@ -2482,7 +2490,7 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; @@ -2594,7 +2602,7 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; } else { @@ -2647,7 +2655,7 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; @@ -3153,7 +3161,7 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { - if (pos < NUM_BUFFERS) + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); else new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); From 89fc5b8f4f1c56b50896773e667c3a215342e49c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 19:50:24 +0200 Subject: [PATCH 076/143] Fix unmap logic --- driver/others/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 689aba942..1f66ef9e9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,8 +76,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #else #define likely(x) (x) +#define unlikely(x) (x) #endif #endif @@ -3097,7 +3099,7 @@ void blas_memory_free(void *free_area){ if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif - if (memory_overflowed) { + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory From 1d83ca4bca890536f1c7713a3432a9daf59d2c2c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 03:14:18 +0000 Subject: [PATCH 077/143] Small Matrix: support BFLOAT16 data type --- common_level3.h | 12 ++++ common_macro.h | 18 ++--- common_param.h | 13 ++++ common_sb.h | 12 ++++ interface/gemm.c | 6 +- kernel/Makefile.L3 | 75 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 +- kernel/setparam-ref.c | 5 ++ 11 files changed, 137 insertions(+), 20 deletions(-) diff --git a/common_level3.h b/common_level3.h index 187402a9a..5080ada10 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,6 +516,13 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG 
ldc); +int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); @@ -530,6 +537,11 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, 
float * C, BLASLONG ldc); int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index aeb9a205b..cf2a3fd88 100644 --- a/common_macro.h +++ b/common_macro.h @@ -942,17 +942,17 @@ #define GEADD_K SGEADD_K -#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT +#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT -#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN -#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT -#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN -#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN -#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT -#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN -#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT #endif diff --git a/common_param.h b/common_param.h index 7e8bea4fe..31fba9059 100644 --- a/common_param.h +++ b/common_param.h @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#ifdef SMALL_MATRIX_OPT + int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, 
BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif #endif #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) diff --git a/common_sb.h b/common_sb.h index 9976e812e..d21e7a563 100644 --- a/common_sb.h +++ b/common_sb.h @@ -24,6 +24,7 @@ #define SBGEMM_BETA sbgemm_beta #define SBGEMM_KERNEL sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit #else #define SBDOT_K gotoblas -> sbdot_k @@ -41,8 +42,19 @@ #define SBGEMM_BETA gotoblas -> sbgemm_beta #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit #endif +#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) +#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) +#define SBGEMM_SMALL_KERNEL_TN 
FUNC_OFFSET(sbgemm_small_kernel_tn) +#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) + +#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) +#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) +#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) +#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) + #define SBGEMM_NN sbgemm_nn #define SBGEMM_CN sbgemm_tn #define SBGEMM_TN sbgemm_tn diff --git a/interface/gemm.c b/interface/gemm.c index 3497d8651..47e0ca0c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,7 +105,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) #define USE_SMALL_MATRIX_OPT 1 #else #define USE_SMALL_MATRIX_OPT 0 @@ -131,8 +131,8 @@ static size_t gemm_small_kernel_b0[] = { GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, }; -#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) -#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else static size_t zgemm_small_kernel[] = { diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ef11e391c..404f774cc 100644 --- a/kernel/Makefile.L3 +++ 
b/kernel/Makefile.L3 @@ -450,6 +450,15 @@ endif ###### BLAS small matrix optimization ##### ifeq ($(SMALL_MATRIX_OPT), 1) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ @@ -4424,6 +4433,72 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + 
+$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c endif diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index 71700a1fa..b0638c7ea 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index b287b3837..0a965db58 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index c41ea7211..69ffc718c 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 734510c67..9d68de3f9 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f303d0dc6..19b7b5f0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -112,6 +112,11 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif +#ifdef SMALL_MATRIX_OPT + sbgemm_small_matrix_permitTS, + sbgemm_small_kernel_nnTS, sbgemm_small_kernel_ntTS, sbgemm_small_kernel_tnTS, sbgemm_small_kernel_ttTS, + 
sbgemm_small_kernel_b0_nnTS, sbgemm_small_kernel_b0_ntTS, sbgemm_small_kernel_b0_tnTS, sbgemm_small_kernel_b0_ttTS, +#endif #endif #if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) From 7d27b182fc6cb2d1b8fc7967c40dd89727fcf875 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 06:10:51 +0000 Subject: [PATCH 078/143] sbgemm: cooperlake: enable SBGEMM by small matrix path --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +++ .../x86_64/sbgemm_block_microk_cooperlake.c | 19 +--- .../sbgemm_microk_cooperlake_template.c | 5 +- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 + .../sbgemm_small_kernel_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_permit_cooperlake.c | 42 +++++++++ .../sbgemm_small_kernel_template_cooperlake.c | 89 +++++++++++++++++++ .../sbgemm_small_kernel_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_tt_cooperlake.c | 2 + 13 files changed, 162 insertions(+), 19 deletions(-) create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 0b2f3c0ed..151c02d5a 100644 --- 
a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1 +1,11 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX + +SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c +SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 147c5ebdd..2c27221ac 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1,6 +1,5 @@ -//#include "sbgemm.h" - #include + // Walk around those intrinsics that missed by compiler #define MM256_LOADU_EPI16(addr) \ _mm256_maskz_loadu_epi16(~0, (addr)) @@ -1747,7 +1746,7 @@ void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG } // Scale matrix C when beta is not ZERO or ONE -void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1759,12 +1758,6 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 array_512_0, array_512_1, array_512_2, array_512_3; __m512 BETAVECTOR = _mm512_set1_ps(beta); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); @@ 
-1828,7 +1821,7 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST } // Zero C matrix when Beta is 0 -void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1839,12 +1832,6 @@ void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index c71595813..b8ed9838e 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -1,8 +1,6 @@ -#include "sbgemm.h" #include "bf16_common_macros.h" #include -/* These macros are needed and should be placed at the right place #define BF16_BLOCK_STEP_N 8 #define BF16_BLOCK_THRES_K 1024 #define BF16_BLOCK_THRES_M 32 @@ -14,7 +12,6 @@ #define ONE 1.e0f #define ZERO 0.e0f -*/ #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT @@ -1798,6 +1795,7 @@ void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, } /* ----------------------------------------- End of TT kernels --------------------------------------- */ +/* #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 
*B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) @@ -1836,3 +1834,4 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ } } } +*/ diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c new file mode 100644 index 000000000..373457f84 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c new file mode 100644 index 000000000..0b840c248 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c new file mode 100644 index 000000000..67542b69c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c new file mode 100644 index 000000000..17b5b41c5 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c new file mode 100644 index 000000000..ec40a5054 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c new file mode 100644 index 000000000..1cdfd2936 --- /dev/null +++ 
b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c new file mode 100644 index 000000000..823aafbdd --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 1; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c new file mode 100644 index 000000000..d328b0981 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +extern void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc); +extern void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc); + +extern void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, 
blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); + +#if defined(TRANS_NN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nn_alpha +#elif defined(TRANS_NT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nt_alpha +#elif defined(TRANS_TN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tn_alpha +#elif defined(TRANS_TT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tt_alpha +#endif + +#define BF16_BLOCK_THRES_K 1024 +// If we want to adjust this to be bigger, need to change COL_MAJOR_INCOPY_KERNEL_Kx32 kernel to be bigger also +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + bfloat16 * block_A; + bfloat16 * block_B; + + block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); + block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + +#if defined(B0) + sbgemm_zero_operation(M, N, C, ldc); +#else + sbgemm_scal_operation(M, N, beta, C, ldc); 
+#endif + + if (alpha == ONE) { + SBGEMM_BLOCKING_KERNEL_ONE(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + free(block_A); + free(block_B); + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c new file mode 100644 index 000000000..f1a0d0d0c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c new file mode 100644 index 000000000..8a2a597bc --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TT +#include "sbgemm_small_kernel_template_cooperlake.c" From 2e44ca0136da2829e1c2e65e2cdd4a8d540491a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 00:51:24 +0800 Subject: [PATCH 079/143] sbgemm: add missing cblas_sbgemm definition --- cblas.h | 2 ++ interface/gemm.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index f0220eb99..a5ad25ad7 100644 --- a/cblas.h +++ b/cblas.h @@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum 
CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/interface/gemm.c b/interface/gemm.c index 47e0ca0c3..6dcc54041 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -273,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, - FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + IFLOAT *a, blasint lda, + IFLOAT *b, blasint ldb, FLOAT beta, FLOAT *c, blasint ldc) { #else From f39301935c27e34acbf95757e644ba6e3ce95cef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 18:43:41 +0800 Subject: [PATCH 080/143] sbgemm: cooperlake: make sure hot buffer aligned to 64 --- .../sbgemm_small_kernel_template_cooperlake.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c index d328b0981..1ab7a34ab 100644 --- a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -59,6 +59,10 @@ extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float #define BF16_BLOCK_THRES_M 32 #define BF16_BLOCK_THRES_N 1024 +#define MALLOC_ALIGN64(ptr, size, raw_ptr) \ + raw_ptr = malloc((size) + 63); \ + ptr = (bfloat16 *)(((uintptr_t) raw_ptr + 63) & ~(uintptr_t)63) + #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) @@ -68,9 +72,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG 
lda, FLOAT al { bfloat16 * block_A; bfloat16 * block_B; + void* raw_ptrA; + void* raw_ptrB; - block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); - block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + MALLOC_ALIGN64(block_A, sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M, raw_ptrA); + MALLOC_ALIGN64(block_B, sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K, raw_ptrB); #if defined(B0) sbgemm_zero_operation(M, N, C, ldc); @@ -83,7 +89,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT al } else { SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); } - free(block_A); - free(block_B); + + free(raw_ptrA); + free(raw_ptrB); return 0; } From 619588fbabaa0ee470487b9afd063541e95c486b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 30 Aug 2021 17:48:11 +0800 Subject: [PATCH 081/143] sbgemm: remove unnecessary b0 files --- kernel/Makefile.L3 | 16 ++++++++-------- kernel/generic/gemm_small_matrix_kernel_nn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 ++-- kernel/x86_64/KERNEL.COOPERLAKE | 8 ++++---- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 -- 10 files changed, 20 insertions(+), 28 deletions(-) delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 404f774cc..49b7c78fb 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4471,32 +4471,32 
@@ $(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ ifndef SBGEMM_SMALL_K_B0_NN -SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SBGEMM_SMALL_K_B0_NT -SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SBGEMM_SMALL_K_B0_TN -SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SBGEMM_SMALL_K_B0_TT -SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ endif ifndef CGEMM_SMALL_M_PERMIT diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index b0638c7ea..543e7e047 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index 0a965db58..d4a7aec6a 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index 69ffc718c..2747337f2 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 9d68de3f9..eec926bc7 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 151c02d5a..6272dd73d 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -2,10 +2,10 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c -SBGEMM_SMALL_K_B0_NN = 
sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c -SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_nt_cooperlake.c SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c -SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c -SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c deleted file mode 100644 index 373457f84..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c deleted file mode 100644 index 0b840c248..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c deleted file mode 100644 index 67542b69c..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c deleted file mode 100644 index 17b5b41c5..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_tt_cooperlake.c" From 2db1a99aca0177761f47daa71b27450923eb127e Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:21:25 +0200 Subject: [PATCH 082/143] Clean up debug messages --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1f66ef9e9..c560c4e90 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - printf("num_buffers exceeded, adding auxiliary array\n"); + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); @@ -3057,9 +3057,9 @@ allocation2: UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); -//#endif +#endif #if defined(WHEREAMI) && !defined(USE_OPENMP) @@ -3110,9 +3110,9 @@ void blas_memory_free(void *free_area){ UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf("Unmap from overflow area succeeded.\n\n"); -//#endif +#endif return; } else { // arm: ensure all writes are finished before other thread takes this memory From cd10d1c03be5ecbdf8bda6e448a6cac27f8aa1be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:38:28 +0200 Subject: [PATCH 083/143] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index c560c4e90..48067923e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") + 
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); From d1ed72fa87b2c1cdefed4b34682e719a9b326a8c Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 24 Aug 2021 06:09:29 +0100 Subject: [PATCH 084/143] [win/arm64]: Explicit casting for GMEMM_DEFAULT_ALIGN to create 64-bit value Win64 uses LLP64 datamodel and unsigned long is only 32-bit. For 64-bit architecture we need 64-bit mask to correctly generate address --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 634e0ef5d..5250b2f39 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL #define SYMV_P 16 From 7cddbf99b1dd9f99203daf9430c5d87f4eac6b56 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 31 Aug 2021 14:36:44 +0100 Subject: [PATCH 085/143] Make explicit conversion condition on _WIN64 flag --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index 5250b2f39..07397a66e 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 +#ifdef _WIN64 +/* Use explicit casting for win64 as LLP64 datamodel is used */ #define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL +#else +#define GEMM_DEFAULT_ALIGN 0x03fffUL +#endif #define SYMV_P 16 From f1e33059746c1fc3a4df76f524c1d4f37f9665b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Sep 2021 21:36:50 +0200 Subject: [PATCH 086/143] Add workaround for Windows10 macro name clash --- kernel/Makefile.L3 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 49b7c78fb..2d274d33b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4544,7 +4544,7 @@ $(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4556,7 +4556,7 @@ $(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4608,7 +4608,7 @@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4620,7 +4620,7 @@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ @@ -4680,7 +4680,7 @@ $(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4692,7 +4692,7 @@ $(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4744,7 +4744,7 @@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) 
$(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4756,7 +4756,7 @@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ From af19cda65aef4d033ae33213013c88b0a99f9da2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Sep 2021 18:26:59 +0200 Subject: [PATCH 087/143] Add "recursive" option for IBM xlf compiler (#3359) * Add correct "recursive" option for xlf (from reference-lapack issue 606) --- Makefile.power | 12 ++++++++++++ cmake/fc.cmake | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 946f55232..4e7478213 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,9 +12,13 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER, IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -33,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -57,7 +65,11 @@ 
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 631664569..f7aa4c5c9 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -104,7 +104,7 @@ endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) From 72f3ce5f084c40006e4548ec2a0de2751f5d2dd9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 Sep 2021 20:35:48 +0200 Subject: [PATCH 088/143] Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness (#3360) * Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness * Update .travis.yml --- .travis.yml | 2 +- getarch.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8657b64f4..8a3d2e5bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c diff --git a/getarch.c b/getarch.c index 6e43616f7..3b08cbfa9 100644 --- a/getarch.c +++ b/getarch.c @@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -354,6 +376,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -384,6 +425,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ From 32fee860330379774a895a18960640120506d317 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Sep 2021 23:44:20 +0200 Subject: [PATCH 089/143] Correct misplaced ifdef lines --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 3b08cbfa9..094feaadd 100644 --- a/getarch.c +++ b/getarch.c @@ -372,10 +372,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" @@ -421,10 +421,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" From 349fb4910b7ba2069ffe8374c14b06fcf419f7c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:19:51 +0200 Subject: [PATCH 090/143] Disable the remaining x86_64 job on Travis --- .travis.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8a3d2e5bb..3dc5fe290 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,24 +7,24 @@ language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran - before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - script: - - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - make -C test $COMMON_FLAGS $BTYPE - - make -C ctest $COMMON_FLAGS $BTYPE - - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu os: linux-ppc64le before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" From 8c68b6f26d1030f2bb932d8b885cb8d076a84437 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:40:40 +0200 Subject: [PATCH 091/143] Update .travis.yml --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3dc5fe290..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,8 +26,13 @@ matrix: # # - <<: *test-ubuntu os: linux-ppc64le - before_script: + before_script: &common-before - 
COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + script: + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX From 4c294336e6bc1b249721c0d9f0ee210d010db9f9 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 03:23:45 +0000 Subject: [PATCH 092/143] sbgemm: cooperlake: add dummy source files --- kernel/x86_64/KERNEL.COOPERLAKE | 11 +++++++ kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c | 32 ++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 7 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 6272dd73d..197907261 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c +SBGEMMINCOPYOBJ = 
sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c new file mode 100644 index 000000000..ea2600067 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c @@ -0,0 +1,32 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ +} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include <immintrin.h> int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; From ef8f5fecc8f532081eb63ded20da650b57e78e54 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 06:14:45 +0000 Subject: [PATCH 093/143] sbgemm: cooperlake: implement sbgemm_tcopy_32 --- kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c index afcf6f647..3e37473ca 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -26,8 +26,116 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ #include <immintrin.h> +#include <stdint.h> #include "common.h" int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *boffset; + + boffset = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table = { + 0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10, 7, + 8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... + */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset, a0); + _mm512_storeu_si512(boffset + 32, a1); + _mm512_storeu_si512(boffset + 64, a2); + _mm512_storeu_si512(boffset + 96, a3); + boffset += 128; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); +
+ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset, a0); + _mm512_storeu_si512(boffset + 32, a1); + boffset += 64; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + _mm512_storeu_si512(boffset, a0); + boffset += 32; + } + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + __mmask16 w_mask = (1UL << (remains - 16)) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset, a0); + _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1); + boffset += 2 * remains; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + _mm512_mask_storeu_epi32(boffset, w_mask, a0); + boffset += 2 * remains; + } + } + for (; i < m; i++) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm512_mask_storeu_epi16(boffset, r_mask, a0); + boffset += remains; + } + } } From 2ec9f3a8aa67e7b36612bc8faf34397e2a968b27 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 01:46:49 +0000 Subject: [PATCH 094/143] sbgemm: cooperlake: change kernel size to 16x4 --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 126 +++++++++++ 
kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c | 32 --- ...perlake.c => sbgemm_ncopy_16_cooperlake.c} | 0 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c | 207 ++++++++++++++++++ ...perlake.c => sbgemm_tcopy_16_cooperlake.c} | 73 +++--- ...operlake.c => sbgemm_tcopy_4_cooperlake.c} | 0 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c | 33 --- 8 files changed, 385 insertions(+), 96 deletions(-) create mode 100644 kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c rename kernel/x86_64/{sbgemm_ncopy_32_cooperlake.c => sbgemm_ncopy_16_cooperlake.c} (100%) create mode 100644 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c rename kernel/x86_64/{sbgemm_tcopy_32_cooperlake.c => sbgemm_tcopy_16_cooperlake.c} (71%) rename kernel/x86_64/{sbgemm_ncopy_8_cooperlake.c => sbgemm_tcopy_4_cooperlake.c} (100%) delete mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 197907261..dba94aea8 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -11,11 +11,11 @@ SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_BETA = sgemm_beta_skylakex.c -SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c -SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c -SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c -SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c -SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c +SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c new file mode 
100644 index 000000000..05ba015d2 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <immintrin.h> +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcastd_epi32(xmm); \ + B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define STORE_4X(A, Bx, By) + + + +int CNAME
(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + + for (; n_count > 23; n_count -= 24) { + m_count = m; + ptr_b0 = ptr_b; + ptr_b1 = ptr_b0 + n_blksize * 3; + for (; m_count > 15; m_count -= 16) { + DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24 * 2; + ptr_b1 += 24 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24; + ptr_b1 += 24; + ptr_a0 += 16; + } + } + } +} diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c deleted file mode 100644 index ea2600067..000000000 --- a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -{ -} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_ncopy_16_cooperlake.c diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..523e3b48f --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <immintrin.h> +#include <stdint.h> +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail
= r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = n & ~31; + BLASLONG m8 = n & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + _mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128(aoffset0 + i); + __m128i r1 = _mm_loadu_si128(aoffset1 + i); + __m128i r2 = _mm_loadu_si128(aoffset2 + i); + __m128i r3 = _mm_loadu_si128(aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128(boffset + 8*0, r0); + _mm_storeu_si128(boffset + 8*1, r1); + _mm_storeu_si128(boffset + 8*2, r2); + _mm_storeu_si128(boffset + 8*3, r3); 
+ boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128(boffset + 8*2, r0); + case 2: _mm_storeu_si128(boffset + 8*1, r0); + case 1: _mm_storeu_si128(boffset + 8*0, r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128(aoffset2 + i); + case 2: r1 = _mm_loadu_si128(aoffset1 + i); + case 1: r0 = _mm_loadu_si128(aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); + boffset += 4 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, 
r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; + + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c similarity index 71% rename from kernel/x86_64/sbgemm_tcopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 3e37473ca..16bf48f0b 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -32,23 +32,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - IFLOAT *boffset; + IFLOAT *boffset0, *boffset1; - boffset = b; + boffset0 = b; BLASLONG n32 = n & ~31; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; uint32_t permute_table = { - 0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10, 7, - 8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15, + 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, + 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, }; __m512i idx_lo = _mm512_loadu_si512(permute_table); __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; for (i = 0; i < m4; i += 4) { /* bf16 fma need special memory layout: * for memory layout like below: @@ -72,11 +74,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a2 = 
_mm512_permutex2var_epi32(a10, idx_lo, a11); a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - _mm512_storeu_si512(boffset + 64, a2); - _mm512_storeu_si512(boffset + 96, a3); - boffset += 128; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; } for (; i < m2; i += 2) { __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); @@ -88,22 +91,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - boffset += 64; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; } for (; i < m; i++) { /* just copy the only remains row */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - _mm512_storeu_si512(boffset, a0); - boffset += 32; + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_storeu_si256(boffset1, a1); + boffset0 += 16; + boffset1 += 16; } + boffset0 = boffset1; } if (j < n) { uint32_t remains = n - j; __mmask32 r_mask = (1UL << remains) - 1; if (remains > 16) { - __mmask16 w_mask = (1UL << (remains - 16)) - 1; + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; for (i = 0; i < m2; i += 2) { __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); @@ -114,9 +124,19 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - 
_mm512_storeu_si512(boffset, a0); - _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1); - boffset += 2 * remains; + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; } } else { __mmask16 w_mask = (1UL << remains ) - 1; @@ -128,14 +148,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m512i a01 = _mm512_unpackhi_epi16(a0, a1); a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - _mm512_mask_storeu_epi32(boffset, w_mask, a0); - boffset += 2 * remains; + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; } - } - for (; i < m; i++) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - _mm512_mask_storeu_epi16(boffset, r_mask, a0); - boffset += remains; } } } diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_8_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_4_cooperlake.c diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c deleted file mode 100644 index afcf6f647..000000000 --- a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c +++ /dev/null @@ -1,33 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - -} From 9df0953cde0833644155eb6f22d241fc773504a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 16 Aug 2021 19:39:24 +0800 Subject: [PATCH 095/143] sbgemm: cooperlake: kernel works for NN --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 375 +++++++++++++++++- kernel/x86_64/sbgemm_ncopy_4_cooperlake.c | 51 +-- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 259 ++++++------ 3 files changed, 515 insertions(+), 170 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 05ba015d2..d604235c9 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -31,8 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) #define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) #define BROADCAST64(base, step, n, offset, zmm) \ - if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ - else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) #define DECLARE_A_PAIR(A) \ __m512i A_lo_##A; __m512i A_hi_##A; @@ -41,8 +41,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
VMOVLDUP(ptr_a##A, A_lo_##A); \ VMOVHDUP(ptr_a##A, A_hi_##A); +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + #define LOAD_A_PAIR_TAIL(A) { \ - __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ @@ -53,13 +66,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ - BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} #define BROADCAST_B_PAIR_TAIL(Bx, By) {\ - __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ xmm = _mm_cvtepu16_epi32(xmm); \ - B_lo = _mm512_broadcastd_epi32(xmm); \ - B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = 
_mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ } #define DECLARE_RESULT_4X(A, Bx, By) \ @@ -76,25 +102,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); -#define STORE_4X(A, Bx, By) +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define _MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, 
result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) { - IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_a = A, *ptr_b = B; IFLOAT *ptr_b0, *ptr_b1; IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; BLASLONG n_count = n; BLASLONG m_count, k_count; BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; m_count = m; - ptr_b0 
= ptr_b; - ptr_b1 = ptr_b0 + n_blksize * 3; for (; m_count > 15; m_count -= 16) { - DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { @@ -105,8 +209,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24 * 2; - ptr_b1 += 24 * 2; + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; ptr_a0 += 16 * 2; } if (k_count > 0) { @@ -117,10 +221,249 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24; - ptr_b1 += 24; + ptr_b0 += 4; + ptr_b1 += 4; ptr_a0 += 16; } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) 
{ + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4; + ptr_b1 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + ptr_a1 += 16; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + } + if (m > 31) { + ptr_a0 = ptr_a1; + } + for (; m_count > 15; m_count 
-= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 
+= 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; } } + return 0; } diff --git 
a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c index 523e3b48f..eefbd7355 100644 --- a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -79,8 +79,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset = a; boffset = b; - BLASLONG m32 = n & ~31; - BLASLONG m8 = n & ~7; + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; BLASLONG n4 = n & ~3; int permute_table[] = { @@ -115,15 +115,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset += 32 * 4; } for (; i < m8; i += 8) { - __m128i r0 = _mm_loadu_si128(aoffset0 + i); - __m128i r1 = _mm_loadu_si128(aoffset1 + i); - __m128i r2 = _mm_loadu_si128(aoffset2 + i); - __m128i r3 = _mm_loadu_si128(aoffset3 + i); + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); REORDER_4x8(r0, r1, r2, r3); - _mm_storeu_si128(boffset + 8*0, r0); - _mm_storeu_si128(boffset + 8*1, r1); - _mm_storeu_si128(boffset + 8*2, r2); - _mm_storeu_si128(boffset + 8*3, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), r3); boffset += 8 * 4; } if (i < m) { @@ -138,9 +138,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ // store should skip the tail odd line int num_store = remain_m/2; switch(num_store) { - case 3: _mm_storeu_si128(boffset + 8*2, r0); - case 2: _mm_storeu_si128(boffset + 8*1, r0); - case 1: _mm_storeu_si128(boffset + 8*0, r0); + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); } boffset += 8 * num_store; @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, 
BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ * need to extract lo words of data and store */ tail = _mm_cvtepi32_epi16(tail); - _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid boffset += 4; } } @@ -167,16 +167,16 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m128i r0, r1, r2, r3; for (i = 0; i < m8; i += 8) { switch (remain_n) { - case 3: r2 = _mm_loadu_si128(aoffset2 + i); - case 2: r1 = _mm_loadu_si128(aoffset1 + i); - case 1: r0 = _mm_loadu_si128(aoffset0 + i); + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); } REORDER_4x8(r0, r1, r2, r3); - _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); - _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); - boffset += 4 * remain_n; + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; } if (i < m) { int remain_m = m - i; @@ -190,9 +190,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ int num_store = remain_m/2; switch (num_store) { - case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); } boffset += 2 * num_store * remain_n; @@ -204,4 +204,5 @@ int CNAME(BLASLONG 
m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 16bf48f0b..ce4458d2c 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -29,134 +29,135 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG i, j; - - IFLOAT *boffset0, *boffset1; - - boffset0 = b; - - BLASLONG n32 = n & ~31; - BLASLONG m4 = m & ~3; - BLASLONG m2 = m & ~1; - - uint32_t permute_table = { - 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, - 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, - }; - - __m512i idx_lo = _mm512_loadu_si512(permute_table); - __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); - - for (j = 0; j < n32; j += 32) { - /* process 2x16 n at the same time */ - boffset1 = boffset0 + m * 16; - for (i = 0; i < m4; i += 4) { - /* bf16 fma need special memory layout: - * for memory layout like below: - * a00, a01, a02, a03, a04, a05 .... - * a10, a11, a12, a13, a14, a15 .... - * need to copy as: - * a00, a10, a01, a11, a02, a12, a03, a13, ... 
- */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); - __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - __m512i a10 = _mm512_unpacklo_epi16(a2, a3); - __m512i a11 = _mm512_unpackhi_epi16(a2, a3); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); - a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - _mm512_storeu_si512(boffset0 + 32, a2); - _mm512_storeu_si512(boffset1 + 32, a3); - boffset0 += 64; - boffset1 += 64; - } - for (; i < m2; i += 2) { - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - boffset0 += 32; - boffset1 += 32; - } - for (; i < m; i++) { - /* just copy the only remains row */ - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_storeu_si256(boffset1, a1); - boffset0 += 16; - boffset1 += 16; - } - boffset0 = boffset1; - } - if (j < n) { - uint32_t remains = n - j; - __mmask32 r_mask = (1UL << remains) - 1; - if (remains > 16) { - boffset1 = boffset0 + m * 16; - uint32_t tail1 = remains - 16; - __mmask16 w_mask1 = (1UL << tail1) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - 
__m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); - - boffset0 += 32; - boffset1 += 2 * tail1; - } - for (; i < m; i++) { - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); - boffset0 += 16; - boffset1 += tail1; - } - } else { - __mmask16 w_mask = (1UL << remains ) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - - _mm512_mask_storeu_epi32(boffset0, w_mask, a0); - boffset0 += 2 * remains; - } - for (; i < m; i++) { - __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); - _mm256_mask_storeu_epi16(boffset0, w_mask, a0); - boffset0 += remains; - } - } - } + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, 
a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... + */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) 
{ + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } } From 8356a604f0bab4844827a1b622aa5c481157bd4b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 19:35:40 +0800 Subject: [PATCH 096/143] sbgemm: cooperlake: tuning for block params --- driver/others/parameter.c | 1 + param.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 36da13369..d7dbddc7c 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,6 +524,7 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 
1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/param.h b/param.h index 07397a66e..48770fa7a 100644 --- a/param.h +++ b/param.h @@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 From cece3541ab739f94add22fda840276033d0feb97 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 21:13:29 +0800 Subject: [PATCH 097/143] sbgemm: cooperlake: fix bug in m64n12 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index d604235c9..c257a3f60 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -306,9 +306,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); ptr_c += 16 * 2; - } - if (m > 31) { ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; } for (; m_count > 15; m_count -= 16) { ptr_b0 = 
ptr_b00; From 45fdf951b64aa9145996727ecda901f00a2eda3c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 22:08:24 +0800 Subject: [PATCH 098/143] sbgemm: cooperlake: reorder ptr increase for performance --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index c257a3f60..4c1f50650 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -203,27 +203,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -240,27 +240,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 
0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += m_count * 2; } if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); @@ -284,21 +284,21 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; - ptr_a1 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; - ptr_a1 += 16; } ptr_c0 = ptr_c; ptr_c1 = ptr_c + 16; @@ -316,19 +316,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { 
LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -342,19 +342,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += m_count * 2; } if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); From 7a2d1601ec84c146b01eeb227d65b51c7855d1ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 23:21:19 +0800 Subject: [PATCH 099/143] sbgemm: cooperlake: unroll core loop by 2 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 4c1f50650..0280b441e 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -201,7 +201,31 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_B_PAIR(); 
DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); - for (k_count = k; k_count > 1; k_count -=2) { + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); From bb1c4fa5bdf93724075ed400e3ff5bbdabd0b31a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 18 Aug 2021 21:17:08 +0800 Subject: [PATCH 100/143] sbgemm: cooperlake: prefetch A & B --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 0280b441e..7af51b6d8 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -64,6 +64,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DECLARE_B_PAIR() \ __m512i B_lo; __m512i B_hi; +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + #define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); @@ -204,17 +209,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * k_count = k; for (; k_count > 3; k_count -=4) { LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; - BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); - BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); - BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); - BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); - BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); ptr_b1 += 4 * 2; LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); From 5fcacad32bb71fd6c6e04e078eeaf59120a9ba72 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 00:08:06 +0800 Subject: [PATCH 101/143] sbgemm: cooperlake: implement tcopy_4 --- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 1 + kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 
ce4458d2c..88725f343 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -160,4 +160,5 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index afcf6f647..74f30d44a 100644 --- a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -26,8 +26,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include +#include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n8 = n & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + for (j = 0; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i +=4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + 
boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; } From beccb83b167b50e3742aa113aab51e57d0e9baa2 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 19:46:08 +0800 Subject: [PATCH 102/143] sbgemm: cooperlake: add n24 kernel for tcopy_4 --- kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 101 +++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git 
a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index 74f30d44a..e9edd4571 100644 --- a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -29,6 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; @@ -36,13 +40,106 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset0 = b; + BLASLONG n24 = n - (n % 24); BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; - for (j = 0; j < n8; j += 8) { + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = 
_mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, 
_mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { boffset1 = boffset0 + m * 4; - for (i = 0; i < m4; i +=4) { + for (i = 0; i < m4; i += 4) { __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); From 682d66555d050dd31a48e5337815b5e1422d8f80 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 20 Aug 2021 22:01:00 +0800 Subject: [PATCH 103/143] sbgemm: cooperlake: implement ncopy_16 --- kernel/x86_64/sbgemm_ncopy_16_cooperlake.c | 320 +++++++++++++++++++++ 1 file changed, 320 insertions(+) diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index afcf6f647..95ed82d7c 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -26,8 +26,328 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include +#include #include "common.h" +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, 
nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; \ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; + aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + u_int64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + 
__m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + 
STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; 
+ load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + 
switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; } From 59a1114d03b59794ae46eb6ae60b9a3b4b842709 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 
Sep 2021 18:12:40 +0800 Subject: [PATCH 104/143] sbgemm: cooperlake: tuning for small matrix --- kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c index 823aafbdd..70becd9fa 100644 --- a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { - return 1; + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; } From 4289cf048dc1b5b735f65a3183f2c903c8f090bc Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 18:34:26 +0800 Subject: [PATCH 105/143] sbgemm: avoid falling into SGEMM_KERNEL_DIRECT --- interface/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemm.c b/interface/gemm.c index 6dcc54041..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif From 045ed5c91df1e4d330ff1a3e93a721f98552692b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 23:37:08 +0800 Subject: [PATCH 106/143] sbgemm: fix build error in BFLOAT16 disabled --- driver/others/parameter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/parameter.c 
b/driver/others/parameter.c index d7dbddc7c..791e5dc27 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,7 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -630,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; From b858e65476b0ece1ccd082c62dd23d5ff1cb44b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Sep 2021 10:51:59 +0200 Subject: [PATCH 107/143] migrate from deprecated ubuntu-16.04 vmImage --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b1bded639..5d4a1ecd3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ jobs: # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo 
"FROM quay.io/pypa/manylinux1_x86_64 @@ -35,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image From 7f4aa106f27d11cfa7e394238f222cca4f93d1bd Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 8 Sep 2021 07:04:13 -0500 Subject: [PATCH 108/143] Fixing syntax error in makefile Fixing syntax issue in Makefile.power added by recent commit af19cda65aef4d033ae33213013c88b0a99f9da2 --- Makefile.power | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 4e7478213..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,7 +12,7 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math -ifeq ($(F_COMPILER, IBM) +ifeq ($(F_COMPILER), IBM) FCOMMON_OPT += -O2 -qrecur -qnosave else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math From d17238599b573350b166973619039e67fba12fdd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 13:38:28 +0200 Subject: [PATCH 109/143] Add casts --- kernel/x86_64/dasum_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/sasum_microk_haswell-2.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += 
(__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - 
accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From 20581bf303776f831c788ced24f179d720ec5c39 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:36:27 +0200 Subject: [PATCH 110/143] Remove unused variable --- interface/zsyr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 71d4dbf29..c70bd819e 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; From ef2471203068b64d648b1495c9399bc18e802788 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:37:44 +0200 Subject: [PATCH 111/143] Move a conditionally used variable --- kernel/generic/dot.c | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + (double) y[i+3] * (double) x[i+3] ; } #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] From 7d873a329f477c676b39719d4f83a87a506cc0b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:38:47 +0200 Subject: [PATCH 112/143] Add ifdefs around conditionally used functions --- kernel/x86_64/sgemv_n_4.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 90865c4b3..0d8cada75 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -246,6 +248,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif +#endif + static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 1085775bc68c7de6e4a93c0d920b5564c8e84706 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:05:55 +0200 
Subject: [PATCH 113/143] really remove the unused variable --- interface/zsyr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index c70bd819e..54fb8a4e9 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; From 0925dfe2c9a287f1fadfd20ea718e89b722c4de0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:30:19 +0200 Subject: [PATCH 114/143] One instance of kernel_4x1 is used even on SKX --- kernel/x86_64/sgemv_n_4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 0d8cada75..e0778006f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -172,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 @@ -248,8 +249,6 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif -#endif - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 5e4f1e3677df7ca74fd9d3dd264de8ca095f0553 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:09:46 +0200 Subject: [PATCH 115/143] Remove BFLOAT16 from the task list of GenerateNamedObject --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0330b2ce7..ef7457135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") From 
1c0a8a714a5b00b1773c8a91b9cd155007b10480 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:10:58 +0200 Subject: [PATCH 116/143] Add defaults for SBGEMV kernels --- cmake/kernel.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..09ca5eb57 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) set(SHSWAPKERNEL ../arm/swap.c) set(TOBF16KERNEL ../x86_64/tobf16.c) set(BF16TOKERNEL ../x86_64/bf16to.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () From e02df9fc55d96388951901420d6be9ff9e404228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:12:27 +0200 Subject: [PATCH 117/143] Propagate BUILD_BFLOAT16 to CFLAGS --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7d2672998..f56ded966 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -469,6 +469,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() From 5f6a6092537f156d14e11bd5cd6f6b15c3f861ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:13:57 +0200 Subject: [PATCH 118/143] Add sbgemv --- driver/level2/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 61367e596..3e9964ab1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) endif () +# special defines for complex if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach 
(u_source ${U_SOURCES}) @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (BUILD_BFLOAT16) + if (USE_THREAD) + GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") + endif () +endif () + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") From 2f8220d757e9db0d4b748232cbdb2582ff64f611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:14:43 +0200 Subject: [PATCH 119/143] Add sbgemm --- driver/level3/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 077862abc..75b25d039 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + endif () + endif () endforeach () if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) From c35739db5ee784ba5a210441b0f30962a2f36b01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:15:57 +0200 Subject: [PATCH 120/143] Add separate entries for BFLOAT16 functions and fix missing cblas_xerbla --- interface/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..ccb5fce3f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -82,6 +82,7 @@ foreach 
(CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -104,6 +105,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) From ddf106f769637cbfa09ee3c3dbe3bfe4cb04ef56 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:17:18 +0200 Subject: [PATCH 121/143] Add dedicated entries for BFLOAT16 kernels --- kernel/CMakeLists.txt | 105 ++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d8a230436..9ffbd944f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") 
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" 
false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" 
"${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false 
"BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) @@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" 
"${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () From ce036a2fc0a593a780a7ecd12933afd93e265e85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 21:41:53 +0200 Subject: [PATCH 122/143] Add casts --- kernel/x86_64/dasum_microk_skylakex-2.c | 8 ++++---- kernel/x86_64/sasum_microk_skylakex-2.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += 
(__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From dd09f0173e90f98ec382ef5ce1ddf4d1eb7c67e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 21:52:26 +0200 Subject: [PATCH 123/143] Remove extraneous qualifiers from struct definition --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 48067923e..0185fa683 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2695,7 +2695,7 @@ static volatile struct { } memory[NUM_BUFFERS]; -static volatile struct newmemstruct +struct newmemstruct { BLASULONG lock; void *addr; From b751edf6248e1897d1966d4693b2be980b89f518 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 13:36:07 -0500 Subject: [PATCH 124/143] Fix unused 
variable warnings on Power --- kernel/power/drot.c | 4 +--- kernel/power/idamax.c | 2 +- kernel/power/trsm_kernel_LN_power10.c | 1 - kernel/power/trsm_kernel_LT_power10.c | 1 - kernel/power/zgemv_n_4.c | 1 - kernel/power/zgemv_n_power10.c | 1 - 6 files changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3229878e4..30c7411cc 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -110,8 +110,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -139,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } #endif diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c index 5ca1603a6..246c3a236 100644 --- a/kernel/power/trsm_kernel_LN_power10.c +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[120] = (c0[15] *= a[255]); b[121] = (c1[15] *= a[255]); diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c index 14ff12fe4..51f3a4e61 100644 --- a/kernel/power/trsm_kernel_LT_power10.c +++ 
b/kernel/power/trsm_kernel_LT_power10.c @@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[0] = (c0[0] *= a[0]); b[1] = (c1[0] *= a[0]); diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c index f5bb8d70e..a545b00d8 100644 --- a/kernel/power/zgemv_n_power10.c +++ b/kernel/power/zgemv_n_power10.c @@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; From 99aa10b3ff8870f4718fc842ce80871247cb93af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:10:43 +0200 Subject: [PATCH 125/143] Initialize abs_mask1 with itself to silence a gcc warning actual initialization is via the _mm_cmpeq_epi8, which I've seen claimed to be the fastest way to set an xmm register to all 1s --- kernel/x86_64/casum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ 
b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); From 8dfa61a61c0b6d1f9a742e3dc2ae455bb3703cc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:11:35 +0200 Subject: [PATCH 126/143] Initialize abs_mask1 with itself to silence a gcc warning --- kernel/x86_64/zasum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); From 0e8b4adf22981f3bd8f80e7e1f9e58edec54a598 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 22:18:48 +0000 Subject: [PATCH 127/143] Remove unused commented code (#if directive) --- driver/others/dynamic_power.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d9c15b312..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif From 7d4a2215799772a4d81a3d3e3b8d7faa515c68b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:18:25 +0200 
Subject: [PATCH 128/143] Remove unused TEMP2 and reshuffle to leave x18 unused (reserved on OSX) --- kernel/arm64/dgemm_tcopy_8.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 From 0a4ac4b5850b5dee9f285637f06a4594f2e10dc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:19:51 +0200 Subject: [PATCH 129/143] Use x21 for I to leave x18 unused (reserved on OSX) --- kernel/arm64/sgemm_tcopy_16.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 46198b3a2..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. #define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 From 7d751774465637c25ef45d8c0f2a2361553e3df4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:24:11 +0200 Subject: [PATCH 130/143] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/dtrmm_kernel_8x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] From 380940271b7647cc82000b4f34d681a3259d222f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:28:19 +0200 Subject: [PATCH 131/143] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/strmm_kernel_16x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] From 590fbff06e818c3135a0b80cfae5a471da7f4e09 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:42:17 +0200 Subject: [PATCH 132/143] move alpha to x19/x20 to leave x18 unused for OSX --- kernel/arm64/zgemm_kernel_4x4.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] From 90cc944625ce0405145bdde03af0bf4e19e3f1ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:53:18 +0200 Subject: [PATCH 133/143] Move alphaI to x22 to leave x18 unused (reserved on OSX) --- kernel/arm64/ztrmm_kernel_4x4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 From 5c537a5de07909f66c64cd8128c4a44df6ac8ba4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Sep 2021 14:54:35 +0200 Subject: [PATCH 134/143] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 88a5a5035..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. 
@@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library From b7bb2e36b8b8197bf4ae794b0982dde0336e17bc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Sun, 26 Sep 2021 12:17:21 +0300 Subject: [PATCH 135/143] Makefile.system: adjust mipsel/mips64el ARCH variables When building for MIPS{64} little-endian variants, the included makefiles should be the same as for the big-endian. There are already some adjustments being done for some ARCH names. This change adds the ones for the `mipsel` and `mips64el` names, so that the Makefile.mips{64} files get included. 
This comes as a result of: https://github.com/openwrt/packages/issues/16649 Signed-off-by: Alexandru Ardelean --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20db80d07..150dbef50 100644 --- a/Makefile.system +++ b/Makefile.system @@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7) override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 +else ifeq ($(ARCH), mipsel) +override ARCH=mips +else ifeq ($(ARCH), mips64el) +override ARCH=mips64 else ifeq ($(ARCH), zarch) override ARCH=zarch endif From ee5ca8a328bae3da45a15452e9772c67165fabe0 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 28 Sep 2021 18:22:15 +0800 Subject: [PATCH 136/143] x86_64: BFLOAT16: fix build warning --- kernel/x86_64/bf16_common_macros.h | 36 ++++---- kernel/x86_64/sbdot_microk_cooperlake.c | 14 +-- .../x86_64/sbgemm_block_microk_cooperlake.c | 2 +- .../sbgemv_n_microk_cooperlake_template.c | 11 ++- .../sbgemv_t_microk_cooperlake_template.c | 91 +++++++++++++------ 5 files changed, 100 insertions(+), 54 deletions(-) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 78db7abb2..cdb4beff6 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda 
+ idx_n])); \ + regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = 
_mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, 
bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2c27221ac..b8c41f4f7 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat // K=Any number but will be processed based on 32, M<=16 void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * src_addr0; bfloat16 * dst_addr0, * dst_addr1; BLASLONG tag_k_32x = k & (~31); diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + 
matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = 
_mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, 
float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, 
bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); 
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = 
_mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef 
ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - 
matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); From 2d33e12a119f0cf97e5c41ff4f6499e9229d9bd5 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 30 Sep 2021 03:14:15 -0400 Subject: [PATCH 137/143] Make sure that Netlib LAPACK respects FFLAGS OpenBLAS allows users to specify `FFLAGS` and then uses `override` to append additional options. However, without such an override in lapack's make.inc, lapack would use the external FFLAGS, rather than the ones being computed by OpenBLAS. For example the `DEBUG=1` flag would not apply to LAPACK code. This is all a bit messy but forced by the integration with netlib lapack. Note that `CFLAGS` already has this override for the same reason. It is possible that other variables here should have a similar override, but I think for most of the other ones, OpenBLAS's build system does not append to the flags passed in by the user. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 555d1c467..49fd57ff2 100644 --- a/Makefile +++ b/Makefile @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc From 2be5ee3cca97a597f2ee2118808a2d5eacea050c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:17:21 +0200 Subject: [PATCH 138/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/clarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. 
(M.LE.0) ) THEN RETURN END IF * From fe497efa0510466fd93578aaf9da1ad8ed4edbe7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:18:20 +0200 Subject: [PATCH 139/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/dlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From ddb0ff5353637bb5f5ad060c9620e334c143e3d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:07 +0200 Subject: [PATCH 140/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/slarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From 337b65133df174796794871b3988cd03426e6d41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:53 +0200 Subject: [PATCH 141/143] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/zlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From ad87d627487a2647ee782b3948ceeba8733bee68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 16:27:34 +0200 Subject: [PATCH 142/143] Update Alpine version --- azure-pipelines.yml | 5 
+++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d4a1ecd3..f9e79018b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -206,8 +206,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ + && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 5a468ae87a44f4eee356d629d0826bed0a5a5f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 19:25:58 +0200 Subject: [PATCH 143/143] Update Changelog for 0.3.18 (#3388) * Update Changelog for 0.3.18 --- Changelog.txt | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index ee0484e2b..59fe1d45e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,47 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - 
improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + ==================================================================== Version 0.3.17 15-Jul-2021