| @@ -183,4 +183,7 @@ In chronological order: | |||||
| * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | ||||
| * [2020-04-15] Half-precision GEMM for bfloat16 | * [2020-04-15] Half-precision GEMM for bfloat16 | ||||
| * Marius Hillenbrand <https://github.com/mhillenibm> | |||||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | |||||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | |||||
| @@ -563,8 +563,27 @@ DYNAMIC_CORE += EMAG8180 | |||||
| endif | endif | ||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| DYNAMIC_CORE = Z13 | |||||
| DYNAMIC_CORE = ZARCH_GENERIC | |||||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||||
| GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2")) | |||||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||||
| RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3")) | |||||
| endif | |||||
| ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1) | |||||
| DYNAMIC_CORE += Z13 | |||||
| else | |||||
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2)
| endif | |||||
| GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||||
| ifeq ($(GCC_MAJOR_GE_7), 1) | |||||
| DYNAMIC_CORE += Z14 | DYNAMIC_CORE += Z14 | ||||
| else | |||||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| @@ -855,7 +874,7 @@ ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| FCOMMON_OPT += -recursive | |||||
| FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens | |||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| FCOMMON_OPT += -fopenmp | FCOMMON_OPT += -fopenmp | ||||
| endif | endif | ||||
| @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector | |||||
| endif | endif | ||||
| ifeq ($(CORE), Z14) | ifeq ($(CORE), Z14) | ||||
| CCOMMON_OPT += -march=z14 -mzvector | |||||
| CCOMMON_OPT += -march=z14 -mzvector -O3 | |||||
| FCOMMON_OPT += -march=z14 -mzvector | FCOMMON_OPT += -march=z14 -mzvector | ||||
| endif | endif | ||||
| @@ -49,6 +49,12 @@ else | |||||
| GOTO_LAPACK_TARGETS= | GOTO_LAPACK_TARGETS= | ||||
| endif | endif | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| GOTO_HALF_TARGETS=shgemm.goto | |||||
| else | |||||
| GOTO_HALF_TARGETS= | |||||
| endif | |||||
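# shgemm.goto is only built when the library itself was configured with
# BUILD_HALF=1, i.e. with bfloat16 (shgemm) support compiled in.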
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | ||||
| @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | ||||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | ||||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto | |||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||||
| samin.goto damin.goto camin.goto zamin.goto \ | samin.goto damin.goto camin.goto zamin.goto \ | ||||
| smin.goto dmin.goto \ | smin.goto dmin.goto \ | ||||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | ||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) | |||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) | |||||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | ||||
| ##################################### Sgemm #################################################### | ##################################### Sgemm #################################################### | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) | |||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||||
| endif | |||||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | ||||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | ||||
| @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c | |||||
| zcholesky.$(SUFFIX) : cholesky.c | zcholesky.$(SUFFIX) : cholesky.c | ||||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | ||||
| ifeq ($(BUILD_HALF),1) | |||||
| shgemm.$(SUFFIX) : gemm.c | |||||
| $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||||
| endif | |||||
| sgemm.$(SUFFIX) : gemm.c | sgemm.$(SUFFIX) : gemm.c | ||||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | ||||
| @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| #define GEMM BLASFUNC(dgemm) | #define GEMM BLASFUNC(dgemm) | ||||
| #elif defined(HALF) | |||||
| #define GEMM BLASFUNC(shgemm) | |||||
| #else | #else | ||||
| #define GEMM BLASFUNC(sgemm) | #define GEMM BLASFUNC(sgemm) | ||||
| #endif | #endif | ||||
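A hedged aside on the type switch in the next hunk: in OpenBLAS, the input
element type IFLOAT differs from the output type FLOAT only in BUILD_HALF
(bfloat16) builds, roughly along these lines (a sketch of the convention, not
the verbatim common.h definitions):

    #ifdef HALF
    typedef unsigned short bfloat16;   /* 16-bit storage for bfloat16 values */
    #define IFLOAT bfloat16            /* inputs a and b are bfloat16        */
    #else
    #define IFLOAT FLOAT               /* otherwise inputs match the output  */
    #endif

FLOAT itself stays float (or double) for the output c, which is why a and b
below switch to IFLOAT while c does not.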
| @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ | |||||
| int main(int argc, char *argv[]){ | int main(int argc, char *argv[]){ | ||||
| FLOAT *a, *b, *c; | |||||
| IFLOAT *a, *b; | |||||
| FLOAT *c; | |||||
| FLOAT alpha[] = {1.0, 0.0}; | FLOAT alpha[] = {1.0, 0.0}; | ||||
| FLOAT beta [] = {0.0, 0.0}; | FLOAT beta [] = {0.0, 0.0}; | ||||
| char transa = 'N'; | char transa = 'N'; | ||||
| @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ | |||||
| k = to; | k = to; | ||||
| } | } | ||||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { | |||||
| if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { | |||||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | fprintf(stderr,"Out of Memory!!\n");exit(1); | ||||
| } | } | ||||
| if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { | |||||
| if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { | |||||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | fprintf(stderr,"Out of Memory!!\n");exit(1); | ||||
| } | } | ||||
| if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | ||||
| @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ | |||||
| #endif | #endif | ||||
| for (i = 0; i < m * k * COMPSIZE; i++) { | for (i = 0; i < m * k * COMPSIZE; i++) { | ||||
| a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
| a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||||
| } | } | ||||
| for (i = 0; i < k * n * COMPSIZE; i++) { | for (i = 0; i < k * n * COMPSIZE; i++) { | ||||
| b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||||
| b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||||
| } | } | ||||
| for (i = 0; i < m * n * COMPSIZE; i++) { | for (i = 0; i < m * n * COMPSIZE; i++) { | ||||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| @@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| #message(STATUS "match on ${line_match}") | #message(STATUS "match on ${line_match}") | ||||
| set(var_name ${CMAKE_MATCH_1}) | set(var_name ${CMAKE_MATCH_1}) | ||||
| set(var_value ${CMAKE_MATCH_2}) | |||||
| # set(var_value ${CMAKE_MATCH_2}) | |||||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | # check for Makefile variables in the string, e.g. $(TSUFFIX) | ||||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | ||||
| foreach (make_var ${make_var_matches}) | foreach (make_var ${make_var_matches}) | ||||
| @@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | ||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | ||||
| if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||||
| if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||||
| # message (STATUS "condition is true") | # message (STATUS "condition is true") | ||||
| set (IfElse 1) | set (IfElse 1) | ||||
| else () | else () | ||||
| @@ -1,12 +1,58 @@ | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <stdbool.h> | |||||
| // Gate kernels for z13 and z14 on gcc version | |||||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||||
| /* RHEL 7 since 7.3: */ \ | |||||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||||
| __GNUC_RH_RELEASE__ >= 11) | |||||
| #define HAVE_Z13_SUPPORT | |||||
| #endif | |||||
| #if __GNUC__ >= 7 | |||||
| #define HAVE_Z14_SUPPORT | |||||
| #endif | |||||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||||
| #ifdef __GLIBC__ | |||||
| #include <features.h> | |||||
| #if __GLIBC_PREREQ(2, 16) | |||||
| #include <sys/auxv.h> | |||||
| #define HAVE_GETAUXVAL 1 | |||||
| static unsigned long get_hwcap(void) | |||||
| { | |||||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||||
| char *maskenv; | |||||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||||
| maskenv = getenv("LD_HWCAP_MASK"); | |||||
| if (maskenv) | |||||
| hwcap &= strtoul(maskenv, NULL, 0); | |||||
| return hwcap; | |||||
| // note that a missing auxval is interpreted as no capabilities | |||||
| // available, which is safe. | |||||
| } | |||||
| #else // __GLIBC_PREREQ(2, 16) | |||||
| #warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||||
| static unsigned long get_hwcap(void) { | |||||
| // treat missing support for getauxval() as no capabilities available, | |||||
| // which is safe. | |||||
| return 0; | |||||
| } | |||||
| #endif // __GLIBC_PREREQ(2, 16) | |||||
#endif // __GLIBC__
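As a hedged usage sketch (assuming an s390x Linux system with glibc >= 2.16),
the raw capability bits that get_hwcap() starts from can be inspected with a
few lines of C; the HWCAP_S390_* guards keep the sketch compilable elsewhere:

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void) {
            unsigned long hwcap = getauxval(AT_HWCAP);
            printf("AT_HWCAP = %#lx\n", hwcap);
    #ifdef HWCAP_S390_VX
            printf("Vector Facility (VX):           %s\n",
                   (hwcap & HWCAP_S390_VX) ? "yes" : "no");
    #endif
    #ifdef HWCAP_S390_VXE
            printf("Vector-Enhancements Facility 1: %s\n",
                   (hwcap & HWCAP_S390_VXE) ? "yes" : "no");
    #endif
            return 0;
    }

Starting an OpenBLAS-linked program with LD_HWCAP_MASK=0 should then push the
detection below onto the generic kernels, since get_hwcap() honors that mask.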
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| extern gotoblas_t gotoblas_Z13; | extern gotoblas_t gotoblas_Z13; | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| extern gotoblas_t gotoblas_Z14; | extern gotoblas_t gotoblas_Z14; | ||||
| //extern gotoblas_t gotoblas_Z15; | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| //extern gotoblas_t gotoblas_Z14; | |||||
| //#endif | |||||
| #endif | |||||
| #define NUM_CORETYPES 4 | #define NUM_CORETYPES 4 | ||||
| @@ -16,47 +62,50 @@ static char* corename[] = { | |||||
| "unknown", | "unknown", | ||||
| "Z13", | "Z13", | ||||
| "Z14", | "Z14", | ||||
| // "Z15", | |||||
| "ZARCH_GENERIC", | "ZARCH_GENERIC", | ||||
| }; | }; | ||||
| char* gotoblas_corename(void) { | char* gotoblas_corename(void) { | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | if (gotoblas == &gotoblas_Z13) return corename[1]; | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | if (gotoblas == &gotoblas_Z14) return corename[2]; | ||||
| // if (gotoblas == &gotoblas_Z15) return corename[3]; | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| // if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||||
| //#endif | |||||
| return corename[0]; // try generic? | |||||
| #endif | |||||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||||
| return corename[0]; | |||||
| } | } | ||||
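A small, hedged usage sketch: with a DYNAMIC_ARCH build of the library, the
core selected here can be checked through the public wrapper
openblas_get_corename() (declared in OpenBLAS' cblas.h):

    #include <stdio.h>

    /* declared in cblas.h; repeated so the sketch is self-contained */
    extern char *openblas_get_corename(void);

    int main(void) {
            /* expected to print Z13, Z14 or ZARCH_GENERIC on IBM z,
               matching the corename[] table above */
            printf("OpenBLAS core: %s\n", openblas_get_corename());
            return 0;
    }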
| // __builtin_cpu_is is not supported by zarch | |||||
| /** | |||||
* Detect the best-fitting set of kernels by retrieving the CPU features the OS
* supports (from the auxiliary vector value AT_HWCAP) and choosing the set of
* kernels ("coretype") that exploits most of those features and can be compiled with the
| * available gcc version. | |||||
| * Note that we cannot use vector registers on a z13 or newer unless supported | |||||
| * by the OS kernel (which needs to handle them properly during context switch). | |||||
| */ | |||||
| static gotoblas_t* get_coretype(void) { | static gotoblas_t* get_coretype(void) { | ||||
| FILE* infile; | |||||
| char buffer[512], * p; | |||||
| p = (char*)NULL; | |||||
| infile = fopen("/proc/sysinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||||
| if (!strncmp("Type", buffer, 4)) { | |||||
| p = strchr(buffer, ':') + 2; | |||||
| #if 0 | |||||
| fprintf(stderr, "%s\n", p); | |||||
| #endif | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||||
| if (strstr(p, "2964")) return &gotoblas_Z13; | |||||
| if (strstr(p, "2965")) return &gotoblas_Z13; | |||||
| if (strstr(p, "3906")) return &gotoblas_Z14; | |||||
| if (strstr(p, "3907")) return &gotoblas_Z14; | |||||
| if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 | |||||
| if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 | |||||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||||
| return &gotoblas_Z14; | |||||
| #endif | |||||
| // z13: Vector Facility (SIMD for double) | |||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| if (hwcap & HWCAP_S390_VX) | |||||
| return &gotoblas_Z13; | |||||
| #endif | |||||
| return NULL; // should be ZARCH_GENERIC | |||||
| // fallback in case of missing compiler support, systems before z13, or | |||||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||||
| // missing support in the OS kernel) | |||||
| return &gotoblas_ZARCH_GENERIC; | |||||
| } | } | ||||
| static gotoblas_t* force_coretype(char* coretype) { | static gotoblas_t* force_coretype(char* coretype) { | ||||
| @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { | |||||
| switch (found) | switch (found) | ||||
| { | { | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| case 1: return (&gotoblas_Z13); | case 1: return (&gotoblas_Z13); | ||||
| #endif | |||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| case 2: return (&gotoblas_Z14); | case 2: return (&gotoblas_Z14); | ||||
| // case 3: return (&gotoblas_Z15); | |||||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||||
| // case 3: return (&gotoblas_POWER9); | |||||
| //#endif | |||||
| #endif | |||||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||||
| default: return NULL; | default: return NULL; | ||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
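/*
 * Usage note (an assumption based on OpenBLAS' other DYNAMIC_ARCH targets):
 * force_coretype() is reached when the user sets the OPENBLAS_CORETYPE
 * environment variable, e.g. OPENBLAS_CORETYPE=ZARCH_GENERIC to bypass the
 * hwcap-based detection; gotoblas_dynamic_init() below passes that value in.
 */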
| @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { | |||||
| if (gotoblas == NULL) | if (gotoblas == NULL) | ||||
| { | { | ||||
| snprintf(coremsg, 128, "Falling back to Z14 core\n"); | |||||
| snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); | |||||
| openblas_warning(1, coremsg); | openblas_warning(1, coremsg); | ||||
| gotoblas = &gotoblas_Z14; | |||||
| gotoblas = &gotoblas_ZARCH_GENERIC; | |||||
| } | } | ||||
| if (gotoblas && gotoblas->init) { | if (gotoblas && gotoblas->init) { | ||||
| @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c | |||||
| CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c | CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c | ||||
| ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c | ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c | ||||
| CSCALKERNEL = ../arm/zscal.c | |||||
| ZSCALKERNEL = ../arm/zscal.c | |||||
| @@ -86,23 +86,23 @@ DGEMVTKERNEL = dgemv_t_4.c | |||||
| CGEMVTKERNEL = cgemv_t_4.c | CGEMVTKERNEL = cgemv_t_4.c | ||||
| ZGEMVTKERNEL = zgemv_t_4.c | ZGEMVTKERNEL = zgemv_t_4.c | ||||
| STRMMKERNEL = strmm8x4V.S | |||||
| STRMMKERNEL = gemm_vec.c | |||||
| DTRMMKERNEL = trmm8x4V.S | DTRMMKERNEL = trmm8x4V.S | ||||
| CTRMMKERNEL = ctrmm4x4V.S | CTRMMKERNEL = ctrmm4x4V.S | ||||
| ZTRMMKERNEL = ztrmm4x4V.S | ZTRMMKERNEL = ztrmm4x4V.S | ||||
| SGEMMKERNEL = strmm8x4V.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMKERNEL = gemm_vec.c | |||||
| ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
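# With the z14 defaults of SGEMM_UNROLL_M = 16 and SGEMM_UNROLL_N = 4 (see the
# param.h change in this series), the lines above should resolve to
# ../generic/gemm_ncopy_16.c, gemm_tcopy_16.c, gemm_ncopy_4.c and gemm_tcopy_4.c.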
| DGEMMKERNEL = gemm8x4V.S | DGEMMKERNEL = gemm8x4V.S | ||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | DGEMMINCOPY = ../generic/gemm_ncopy_8.c | ||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | DGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
| @@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| @@ -0,0 +1,472 @@ | |||||
| /* | |||||
| * Copyright (c) IBM Corporation 2020. | |||||
| * All rights reserved. | |||||
| * | |||||
| * Redistribution and use in source and binary forms, with or without | |||||
| * modification, are permitted provided that the following conditions are | |||||
| * met: | |||||
| * | |||||
| * 1. Redistributions of source code must retain the above copyright | |||||
| * notice, this list of conditions and the following disclaimer. | |||||
| * | |||||
| * 2. Redistributions in binary form must reproduce the above copyright | |||||
| * notice, this list of conditions and the following disclaimer in | |||||
| * the documentation and/or other materials provided with the | |||||
| * distribution. | |||||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||||
| * its contributors may be used to endorse or promote products | |||||
| * derived from this software without specific prior written | |||||
| * permission. | |||||
| * | |||||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| */ | |||||
| #include "common.h" | |||||
| #include <vecintrin.h> | |||||
| #include <stdbool.h> | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #ifdef COMPLEX | |||||
| #error "Handling for complex numbers is not supported in this kernel" | |||||
| #endif | |||||
| #ifdef DOUBLE | |||||
| #define UNROLL_M DGEMM_DEFAULT_UNROLL_M | |||||
| #define UNROLL_N DGEMM_DEFAULT_UNROLL_N | |||||
| #else | |||||
| #define UNROLL_M SGEMM_DEFAULT_UNROLL_M | |||||
| #define UNROLL_N SGEMM_DEFAULT_UNROLL_N | |||||
| #endif | |||||
| static const size_t unroll_m = UNROLL_M; | |||||
| static const size_t unroll_n = UNROLL_N; | |||||
| /* Handling of triangular matrices */ | |||||
| #ifdef TRMMKERNEL | |||||
| static const bool trmm = true; | |||||
| static const bool left = | |||||
| #ifdef LEFT | |||||
| true; | |||||
| #else | |||||
| false; | |||||
| #endif | |||||
| static const bool backwards = | |||||
| #if defined(LEFT) != defined(TRANSA) | |||||
| true; | |||||
| #else | |||||
| false; | |||||
| #endif | |||||
| #else | |||||
| static const bool trmm = false; | |||||
| static const bool left = false; | |||||
| static const bool backwards = false; | |||||
| #endif /* TRMMKERNEL */ | |||||
| /* | |||||
| * Background: | |||||
| * | |||||
| * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication | |||||
| * problem by splitting all matrices into partitions multiple times, so that the | |||||
| * submatrices fit into the L1 or L2 caches. As a result, each multiplication of | |||||
* submatrices can stream data quickly from the L1 and L2 caches. In between, it copies
| * and rearranges the submatrices to enable contiguous memory accesses to | |||||
| * improve locality in both caches and TLBs. | |||||
| * | |||||
* At the heart of the algorithm is this kernel, which multiplies a "Block
| * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is | |||||
| * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this | |||||
| * operation GEBP. This kernel further partitions GEBP twice, such that (1) | |||||
| * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a | |||||
| * block of C fits into the registers, while multiplying panels from A and B | |||||
| * streamed from the L2 and L1 cache, respectively (GEBP_block). | |||||
| * | |||||
| * | |||||
| * Algorithm GEBP(A, B, C, m, n, k, alpha): | |||||
| * | |||||
| * The problem is calculating C += alpha * (A * B) | |||||
* C is an m x n matrix, A is an m x k matrix, B is a k x n matrix.
| * | |||||
| * - C is in column-major-order, with an offset of ldc to the element in the | |||||
| * next column (same row). | |||||
| * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column | |||||
| * contiguously while walking along rows. | |||||
| * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row | |||||
| * contiguously. | |||||
| * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or | |||||
| * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 | |||||
| * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a | |||||
| * block-of-1). | |||||
| * | |||||
| * Note that packing A and B into that form is taken care of by the caller in | |||||
| * driver/level3/level3.c (actually done by "copy kernels"). | |||||
| * | |||||
| * Steps: | |||||
| * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. | |||||
| * Now, B_j should fit into the L1 cache. | |||||
| * - For each partition, calculate C_j += alpha * (A * B_j) by | |||||
| * (1) Calculate C_aux := A * B_j (see below) | |||||
| * (2) unpack C_j = C_j + alpha * C_aux | |||||
| * | |||||
| * | |||||
| * Algorithm for Calculating C_aux: | |||||
| * | |||||
| * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, | |||||
| * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each | |||||
| * submatrix of C_aux can be calculated independently, and the registers are | |||||
| * added back into C_j. | |||||
| * | |||||
| * - For each row-block of C_aux: | |||||
| * (uses a row block of A and full B_j) | |||||
| * - stream over all columns of A, multiply with elements from B and | |||||
| * accumulate in registers. (use different inner-kernels to exploit | |||||
| * vectorization for varying block sizes) | |||||
| * - add alpha * row block of C_aux back into C_j. | |||||
| * | |||||
| * Note that there are additional mechanics for handling triangular matrices, | |||||
| * calculating B := alpha (A * B) where either of the matrices A or B can be | |||||
| * triangular. In case of A, the macro "LEFT" is defined. In addition, A can | |||||
| * optionally be transposed. | |||||
| * The code effectively skips an "offset" number of columns in A and rows of B | |||||
| * in each block, to save unnecessary work by exploiting the triangular nature. | |||||
| * To handle all cases, the code discerns (1) a "left" mode when A is triangular | |||||
| * and (2) "forward" / "backwards" modes where only the first "offset" | |||||
| * columns/rows of A/B are used or where the first "offset" columns/rows are | |||||
| * skipped, respectively. | |||||
| * | |||||
| * Reference: | |||||
| * | |||||
| * The summary above is based on staring at various kernel implementations and: | |||||
| * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix | |||||
* Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No.
| * 3, May 2008. | |||||
| */ | |||||
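/*
 * A small worked example of the packed layouts described above (assuming
 * UNROLL_M = 4 and UNROLL_N = 2 for brevity): a 4x3 block of A with elements
 * a(i,j) arrives as
 *     a(0,0) a(1,0) a(2,0) a(3,0)  a(0,1) a(1,1) a(2,1) a(3,1)  a(0,2) ...
 * (four row elements of one column at a time), and a 3x2 panel of B with
 * elements b(i,j) arrives as
 *     b(0,0) b(0,1)  b(1,0) b(1,1)  b(2,0) b(2,1)
 * (two column elements of one row at a time). The index expressions
 * A[i + k * ROWS] and B[j + k * COLS] used in GEBP_block below rely on
 * exactly this layout.
 */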
| #define VLEN_BYTES 16 | |||||
| #define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) | |||||
| typedef FLOAT vector_float __attribute__ ((vector_size (16))); | |||||
| /** | |||||
* Load a vector into a register, hinting at 8-byte alignment to improve
* performance. gcc-9 and newer will create these hints by themselves. For older
| * compiler versions, use inline assembly to explicitly express the hint. | |||||
| * Provide explicit hex encoding to cater for binutils versions that do not know | |||||
| * about vector-load with alignment hints yet. | |||||
| * | |||||
| * Note that, for block sizes where we apply vectorization, vectors in A will | |||||
| * always be 8-byte aligned. | |||||
| */ | |||||
| static inline vector_float vec_load_hinted(FLOAT const *restrict a) { | |||||
| vector_float const *restrict addr = (vector_float const *restrict)a; | |||||
| vector_float y; | |||||
| #if __GNUC__ < 9 | |||||
| // hex-encode vl %[out],%[addr],3 | |||||
| asm(".insn vrx,0xe70000003006,%[out],%[addr],3" | |||||
| : [ out ] "=v"(y) | |||||
| : [ addr ] "R"(*addr)); | |||||
| #else | |||||
| y = *addr; | |||||
| #endif | |||||
| return y; | |||||
| } | |||||
| /** | |||||
* Calculate a row block of C_i of size ROWS x COLS using vector intrinsics.
*
* @param[in] A Pointer to the current block of input matrix A.
* @param[in] bk Number of columns in A.
* @param[in] B Pointer to the current block of input matrix B.
* @param[inout] C Pointer to the current block of output matrix C.
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| */ | |||||
| #define VECTOR_BLOCK(ROWS, COLS) \ | |||||
| static inline void GEBP_block_##ROWS##_##COLS( \ | |||||
| FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ | |||||
| FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ | |||||
| _Static_assert( \ | |||||
| ROWS % VLEN_FLOATS == 0, \ | |||||
| "rows in block must be multiples of vector length"); \ | |||||
| vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ | |||||
| \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) \ | |||||
| Caux[i][j] = vec_splats(ZERO); \ | |||||
| \ | |||||
| /* \ | |||||
| * Stream over the row-block of A, which is packed \ | |||||
| * column-by-column, multiply by coefficients in B and add up \ | |||||
| * into temporaries Caux (which the compiler will hold in \ | |||||
| * registers). Vectorization: Multiply column vectors from A \ | |||||
| * with scalars from B and add up in column vectors of Caux. \ | |||||
| * That equates to unrolling the loop over rows (in i) and \ | |||||
| * executing each unrolled iteration as a vector element. \ | |||||
| */ \ | |||||
| for (BLASLONG k = 0; k < bk; k++) { \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | |||||
| vector_float Ak = vec_load_hinted( \ | |||||
| A + i * VLEN_FLOATS + k * ROWS); \ | |||||
| \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) \ | |||||
| Caux[i][j] += Ak * B[j + k * COLS]; \ | |||||
| } \ | |||||
| } \ | |||||
| \ | |||||
| /* \ | |||||
| * Unpack row-block of C_aux into outer C_i, multiply by \ | |||||
| * alpha and add up. \ | |||||
| */ \ | |||||
| for (BLASLONG j = 0; j < COLS; j++) { \ | |||||
| for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ | |||||
| vector_float *C_ij = \ | |||||
| (vector_float *)(C + i * VLEN_FLOATS + \ | |||||
| j * ldc); \ | |||||
| if (trmm) { \ | |||||
| *C_ij = alpha * Caux[i][j]; \ | |||||
| } else { \ | |||||
| *C_ij += alpha * Caux[i][j]; \ | |||||
| } \ | |||||
| } \ | |||||
| } \ | |||||
| } | |||||
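/*
 * For example, in single precision (VLEN_FLOATS = 4), VECTOR_BLOCK(16, 4)
 * defines GEBP_block_16_4(), which keeps Caux[4][4] -- sixteen 4-float vector
 * accumulators covering a 16x4 tile of C -- and, per iteration of the loop
 * over bk, loads four vectors from A and multiplies each with the four
 * scalars B[0..3 + k * 4].
 */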
| #if UNROLL_M == 16 | |||||
| VECTOR_BLOCK(16, 4) | |||||
| VECTOR_BLOCK(16, 2) | |||||
| VECTOR_BLOCK(16, 1) | |||||
| #endif | |||||
| #if UNROLL_N == 8 | |||||
| VECTOR_BLOCK(8, 8) | |||||
| VECTOR_BLOCK(4, 8) | |||||
| #endif | |||||
| VECTOR_BLOCK(8, 4) | |||||
| VECTOR_BLOCK(8, 2) | |||||
| VECTOR_BLOCK(8, 1) | |||||
| VECTOR_BLOCK(4, 4) | |||||
| VECTOR_BLOCK(4, 2) | |||||
| VECTOR_BLOCK(4, 1) | |||||
| #ifdef DOUBLE | |||||
| VECTOR_BLOCK(2, 4) | |||||
| VECTOR_BLOCK(2, 2) | |||||
| #endif | |||||
| /** | |||||
| * Handle calculation for row blocks in C_i of any size by dispatching into | |||||
| * macro-defined (inline) functions or by deferring to a simple generic | |||||
| * implementation. Note that the compiler can remove this awkward-looking | |||||
* dispatching code while inlining.
| * | |||||
| * @param[in] m Number of rows in block C_i. | |||||
| * @param[in] n Number of columns in block C_i. | |||||
| * @param[in] first_row Index of first row of the block C_i (relative to C). | |||||
| * @param[in] A Pointer to input matrix A (note: all of it). | |||||
| * @param[in] k Number of columns in A and rows in B. | |||||
| * @param[in] B Pointer to current column block (panel) of input matrix B. | |||||
| * @param[inout] C Pointer to current column block (panel) of output matrix C. | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| * @param[in] off Running offset for handling triangular matrices. | |||||
| */ | |||||
| static inline void GEBP_block(BLASLONG m, BLASLONG n, | |||||
| BLASLONG first_row, | |||||
| const FLOAT * restrict A, BLASLONG k, | |||||
| const FLOAT * restrict B, | |||||
| FLOAT *restrict C, BLASLONG ldc, | |||||
| FLOAT alpha, | |||||
| BLASLONG offset, BLASLONG off) | |||||
| { | |||||
| if (trmm && left) | |||||
| off = offset + first_row; | |||||
| A += first_row * k; | |||||
| C += first_row; | |||||
| if (trmm) { | |||||
| if (backwards) { | |||||
| A += off * m; | |||||
| B += off * n; | |||||
| k -= off; | |||||
| } else { | |||||
| if (left) { | |||||
| k = off + m; | |||||
| } else { | |||||
| k = off + n; | |||||
| } | |||||
| } | |||||
| } | |||||
| #define BLOCK(bm, bn) \ | |||||
| if (m == bm && n == bn) { \ | |||||
| GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ | |||||
| return; \ | |||||
| } | |||||
| #if UNROLL_M == 16 | |||||
| BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); | |||||
| #endif | |||||
| #if UNROLL_N == 8 | |||||
| BLOCK(8, 8); BLOCK(4, 8); | |||||
| #endif | |||||
| BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); | |||||
| BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); | |||||
| #ifdef DOUBLE | |||||
| BLOCK(2, 4); | |||||
| BLOCK(2, 2); | |||||
| #endif | |||||
| #undef BLOCK | |||||
| /* simple implementation for smaller block sizes: */ | |||||
| FLOAT Caux[m][n] __attribute__ ((aligned (16))); | |||||
| /* | |||||
| * Peel off first iteration (i.e., column of A) for initializing Caux | |||||
| */ | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| Caux[i][j] = A[i] * B[j]; | |||||
| for (BLASLONG kk = 1; kk < k; kk++) | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| Caux[i][j] += A[i + kk * m] * B[j + kk * n]; | |||||
| for (BLASLONG i = 0; i < m; i++) | |||||
| for (BLASLONG j = 0; j < n; j++) | |||||
| if (trmm) { | |||||
| C[i + j * ldc] = alpha * Caux[i][j]; | |||||
| } else { | |||||
| C[i + j * ldc] += alpha * Caux[i][j]; | |||||
| } | |||||
| } | |||||
| /** | |||||
| * Handle a column block (panel) of C and B while calculating C += alpha(A * B). | |||||
| * | |||||
| * @param[in] num_cols Number of columns in the block (in C and B). | |||||
| * @param[in] first_col First column of the current block (in C and B). | |||||
| * @param[in] A Pointer to input matrix A. | |||||
| * @param[in] bk Number of columns in A and rows in B. | |||||
| * @param[in] B Pointer to input matrix B (note: all of it). | |||||
| * @param[in] bm Number of rows in C and A. | |||||
| * @param[inout] C Pointer to output matrix C (note: all of it). | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| */ | |||||
| static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, | |||||
| const FLOAT *restrict A, BLASLONG bk, | |||||
| const FLOAT *restrict B, BLASLONG bm, | |||||
| FLOAT *restrict C, BLASLONG ldc, | |||||
| FLOAT alpha, | |||||
| BLASLONG const offset) { | |||||
| FLOAT *restrict C_i = C + first_col * ldc; | |||||
| /* | |||||
* B is packed in blocks of n_r row elements per column, but the exact
* layout does not matter here: we always advance by whole column
* blocks, i.e., by first_col * bk elements.
| */ | |||||
| const FLOAT *restrict B_i = B + first_col * bk; | |||||
| BLASLONG off = 0; | |||||
| if (trmm) { | |||||
| if (left) { | |||||
| off = offset; | |||||
| } else { | |||||
| off = -offset + first_col; | |||||
| } | |||||
| } | |||||
| /* | |||||
| * Calculate C_aux := A * B_j | |||||
| * then unpack C_i += alpha * C_aux. | |||||
| * | |||||
| * For that purpose, further partition C_aux and A into blocks | |||||
| * of m_r (unroll_m) rows, or powers-of-2 if smaller. | |||||
| */ | |||||
| BLASLONG row = 0; | |||||
| for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) | |||||
| for (; bm - row >= block_size; row += block_size) | |||||
| GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, | |||||
| ldc, alpha, offset, off); | |||||
| } | |||||
| /** | |||||
| * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) | |||||
| * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and | |||||
| * C are pointers to submatrices of the actual matrices. | |||||
| * | |||||
| * For triangular matrix multiplication, calculate B := alpha (A * B) where A | |||||
| * or B can be triangular (in case of A, the macro LEFT will be defined). | |||||
| * | |||||
| * @param[in] bm Number of rows in C and A. | |||||
| * @param[in] bn Number of columns in C and B. | |||||
| * @param[in] bk Number of columns in A and rows in B. | |||||
| * @param[in] alpha Scalar factor. | |||||
| * @param[in] ba Pointer to input matrix A. | |||||
| * @param[in] bb Pointer to input matrix B. | |||||
| * @param[inout] C Pointer to output matrix C. | |||||
| * @param[in] ldc Offset between elements in adjacent columns in C. | |||||
| * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). | |||||
| * @returns 0 on success. | |||||
| */ | |||||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, | |||||
| FLOAT *restrict ba, FLOAT *restrict bb, | |||||
| FLOAT *restrict C, BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| , BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
| if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) | |||||
| return 0; | |||||
| /* | |||||
* The interface code allocates the buffers for ba and bb at page
* granularity (i.e., using mmap(MAP_ANONYMOUS)), so let the compiler
* make use of that fact in vector load operations.
| */ | |||||
| ba = __builtin_assume_aligned(ba, 16); | |||||
| bb = __builtin_assume_aligned(bb, 16); | |||||
| /* | |||||
| * Use offset and off even when compiled as SGEMMKERNEL to simplify | |||||
| * function signatures and function calls. | |||||
| */ | |||||
| #ifndef TRMMKERNEL | |||||
| BLASLONG const offset = 0; | |||||
| #endif | |||||
| /* | |||||
| * Partition B and C into blocks of n_r (unroll_n) columns, called B_i | |||||
| * and C_i. For each partition, calculate C_i += alpha * (A * B_j). | |||||
| * | |||||
| * For remaining columns that do not fill up a block of n_r, iteratively | |||||
| * use smaller block sizes of powers of 2. | |||||
| */ | |||||
| BLASLONG col = 0; | |||||
| for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) | |||||
| for (; bn - col >= block_size; col += block_size) | |||||
| GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); | |||||
| return 0; | |||||
| } | |||||
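/*
 * Toy example of the column partitioning above: with unroll_n = 4 and bn = 7,
 * GEBP_column_block() is called three times, for column blocks of width 4, 2
 * and 1 starting at columns 0, 4 and 6, so every remainder is handled by one
 * of the power-of-two block kernels dispatched in GEBP_block().
 */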
| @@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
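/* 16x4 matches the largest vectorized row block, VECTOR_BLOCK(16, 4), in kernel/zarch/gemm_vec.c */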
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||
| @@ -46,6 +46,27 @@ typedef union | |||||
| } bits; | } bits; | ||||
| } bfloat16_bits; | } bfloat16_bits; | ||||
| typedef union | |||||
| { | |||||
| float v; | |||||
| struct | |||||
| { | |||||
| uint32_t m:23; | |||||
| uint32_t e:8; | |||||
| uint32_t s:1; | |||||
| } bits; | |||||
| } float32_bits; | |||||
| float | |||||
| float16to32 (bfloat16_bits f16) | |||||
| { | |||||
| float32_bits f32; | |||||
| f32.bits.s = f16.bits.s; | |||||
| f32.bits.e = f16.bits.e; | |||||
| f32.bits.m = (uint32_t) f16.bits.m << 16; | |||||
| return f32.v; | |||||
| } | |||||
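/*
 * Worked example (hedged): the bfloat16 bit pattern 0x3FC0 is the upper half
 * of the float 1.5f (0x3FC00000). float16to32() copies s = 0 and e = 127 and
 * widens m = 0x40 to 0x40 << 16 = 0x400000, reassembling 0x3FC00000 == 1.5f.
 */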
| int | int | ||||
| main (int argc, char *argv[]) | main (int argc, char *argv[]) | ||||
| { | { | ||||
| @@ -55,8 +76,6 @@ main (int argc, char *argv[]) | |||||
| int loop = 100; | int loop = 100; | ||||
| char transA = 'N', transB = 'N'; | char transA = 'N', transB = 'N'; | ||||
| float alpha = 1.0, beta = 0.0; | float alpha = 1.0, beta = 0.0; | ||||
| char transa = 'N'; | |||||
| char transb = 'N'; | |||||
| for (int x = 0; x <= loop; x++) | for (int x = 0; x <= loop; x++) | ||||
| { | { | ||||
| @@ -65,30 +84,45 @@ main (int argc, char *argv[]) | |||||
| float B[k * n]; | float B[k * n]; | ||||
| float C[m * n]; | float C[m * n]; | ||||
| bfloat16_bits AA[m * k], BB[k * n]; | bfloat16_bits AA[m * k], BB[k * n]; | ||||
| float CC[m * n]; | |||||
| float DD[m * n], CC[m * n]; | |||||
| for (int j = 0; j < m; j++) | for (int j = 0; j < m; j++) | ||||
| { | { | ||||
| for (int i = 0; i < m; i++) | for (int i = 0; i < m; i++) | ||||
| { | { | ||||
| A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||||
| B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; | |||||
| A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
| B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||||
| C[j * k + i] = 0; | C[j * k + i] = 0; | ||||
| AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; | AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; | ||||
| BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; | BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; | ||||
| CC[j * k + i] = 0; | CC[j * k + i] = 0; | ||||
| DD[j * k + i] = 0; | |||||
| } | } | ||||
| } | } | ||||
| SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, | ||||
| &m, B, &k, &beta, C, &m); | |||||
| &m, B, &k, &beta, C, &m); | |||||
| SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, | ||||
| &m, BB, &k, &beta, CC, &m); | |||||
| &m, BB, &k, &beta, CC, &m); | |||||
| for (i = 0; i < n; i++) | for (i = 0; i < n; i++) | ||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) | |||||
| ret++; | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) | |||||
| ret++; | |||||
| if (transA == 'N' && transB == 'N') | |||||
| { | |||||
| for (i = 0; i < n; i++) | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| { | |||||
| DD[i * m + j] += | |||||
| float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); | |||||
| } | |||||
| for (i = 0; i < n; i++) | |||||
| for (j = 0; j < m; j++) | |||||
| for (l = 0; l < k; l++) | |||||
| if (CC[i * m + j] != DD[i * m + j]) | |||||
| ret++; | |||||
| } | |||||
| } | } | ||||
| if (ret != 0) | if (ret != 0) | ||||
| fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); | ||||