Merge branch 'develop' into betterPowerGEMVTail

1 year ago · 75472b830a
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
 set(OpenBLAS_PATCH_VERSION 27.dev)
 set(OpenBLAS_PATCH_VERSION 28.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS

 option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)

 set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")

 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)

 option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF

 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)

 option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
 option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)

 option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)

@@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
  endif()
 endif()

 if (APPLE AND BUILD_SHARED_LIBS)
 set(CMAKE_MACOSX_RPATH ON)
 endif()

 # It seems that this hack has not been required since macOS 11 Big Sur
 if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,127 @@
 OpenBLAS ChangeLog
 ====================================================================
 Version 0.3.28
 8-Aug-2024

 general:
 - Reworked the unfinished implementation of HUGETLB from GotoBLAS
  for allocating huge memory pages as buffers on suitable systems
 - Changed the unfinished implementation of GEMM3M for the generic
  target on all architectures to at least forward to regular GEMM
 - Improved multithreaded GEMM performance for large non-skinny matrices
 - Improved BLAS3 performance on larger multicore systems through improved
  parallelism
 - Improved performance of the initial memory allocation by reducing
  locking overhead
 - Improved performance of GBMV at small problem sizes by introducing
  a size barrier for the switch to multithreading
 - Added an implementation of the CBLAS_GEMM_BATCH extension
 - Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in 
  CMAKE builds (error introduced in 0.3.27)
 - Fixed corner cases involving the handling of NAN and INFINITY
  arguments in ?SCAL on all architectures
 - Added support for cross-compiling to WEBM with CMAKE (in addition
  to the already present makefile support)
 - Fixed NAN handling and potential accuracy issues in compilations with
  Intel ICX by supplying a suitable fp-model option by default
 - The contents of the github project wiki have been converted into
  a new set of documentation included with the source code.
 - It is now possible to register a callback function that replaces
  the built-in support for multithreading with an external backend
  like TBB (openblas_set_threads_callback_function)
 - Fixed potential duplication of suffixes in shared library naming
 - Improved C compiler detection by the build system to tolerate more
  naming variants for gcc builds
 - Fixed an unnecessary dependency of the utest on CBLAS
 - Fixed spurious error reports from the BLAS extensions utest
 - Fixed unwanted invocation of the GEMM3M tests in cross-compilation
 - Fixed a flaw in the makefile build that could lead to the pkgconfig
  file containing an entry of UNKNOWN for the target cpu after installing
 - Integrated fixes from the Reference-LAPACK project:
  - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
  - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
  - Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
  - Make the variable type used for hidden length arguments configurable (PR 1025)  
  - Fixed SYTRD workspace computation and various typos (PR 1030)
  - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)

 x86-64:
 - reverted thread management under Windows to its state before 0.3.26
  due to signs of race conditions in some circumstances now under study
 - fixed accidental selection of the unoptimized generic SBGEMM kernel
  in CMAKE builds for CooperLake and SapphireRapids targets
 - fixed a potential thread buffer overrun in SBSTOBF16 on small systems
 - fixed an accuracy issue in ZSCAL introduced in 0.3.26
 - fixed compilation with CMAKE and recent releases of LLVM
 - added support for Intel Emerald Rapids and Meteor Lake cpus
 - added autodetection support for the Zhaoxin KX-7000 cpu
 - fixed autodetection of Intel Prescott (probably broken since 0.3.19)
 - fixed compilation for older targets with the Yocto SDK
 - fixed compilation of the converter-generated C versions
  of the LAPACK sources with gcc-14
 - improved compiler options when building with CMAKE and LLVM for
  AVX512-capable targets
 - added support for supplying the L2 cache size via an environment
  variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
  (as in some VM configurations)
 - improved the error message shown when thread creation fails on startup
 - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS

 arm:
 - fixed building for baremetal targets with make

 arm64:
 - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
  matrix to the corresponding GEMV kernel 
 - added optimized SGEMV and DGEMV kernels for A64FX
 - added optimized SVE kernels for small-matrix GEMM
 - added A64FX to the cpu list for DYNAMIC_ARCH
 - fixed building with support for cpu affinity
 - worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
  Apple M targets
 - improved GEMM performance on Neoverse V1
 - fixed compilation for NEOVERSEN2 with older compilers
 - fixed potential miscompilation of the SVE SDOT and DDOT kernels
 - fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
 - fixed a potential overflow when using very large user-defined BUFFERSIZE
 - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS

 power:
 - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
  matrix to the corresponding GEMV kernel 
 - significantly improved performance of SBGEMM on POWER10
 - fixed compilation with OpenMP and the XLF compiler
 - fixed building of the BLAS extension utests under AIX
 - fixed building of parts of the LAPACK testsuite with XLF
 - fixed CSWAP/ZSWAP on big-endian POWER10 targets
 - fixed a performance regression in SAXPY on POWER10 with OpenXL
 - fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
 - fixed building for POWER9 under FreeBSD
 - fixed a potential overflow when using very large user-defined BUFFERSIZE
 - fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV

 riscv64:
 - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
  matrix to the corresponding GEMV kernel 
 - fixed building for RISCV64_GENERIC with OpenMP enabled
 - added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
  RVV 1.0 targets with vector length of 128 and 256)
 - worked around the ZVL128B kernels for AXPBY mishandling the special
  case of zero Y increment

 loongarch64:
 - improved GEMM performance on servers of the 3C5000 generation
 - improved performance and stability of DGEMM
 - improved GEMV and TRSM kernels for LSX and LASX vector ABIs
 - fixed CMAKE compilation with the INTERFACE64 option set
 - fixed compilation with CMAKE
 - worked around spurious errors flagged by the BLAS3 tests
 - worked around a miscompilation of the POTRS utest by gcc 14.1

 mips64:
 - fixed ASUM and SUM kernels to accept negative step sizes in X
 - fixed complex GEMV kernels for MSA

 ====================================================================
 Version 0.3.27
 4-Apr-2024
--- a/+ 4
+++ b/+ 4
@@ -45,6 +45,10 @@ else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 endif

 ifdef LAPACK_STRLEN
 LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
 endif

 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
--- a/Makefile.install
+++ b/Makefile.install
@@ -178,7 +178,7 @@ endif
 	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
 	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
 	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
 	@echo 'version='$(VERSION) >> "$(PKGFILE)"
 	@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
 	@cat openblas.pc.in >> "$(PKGFILE)"
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
 VERSION = 0.3.27.dev
 VERSION = 0.3.28.dev

 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
 # Build LAPACK Deprecated functions since LAPACK 3.6.0
 BUILD_LAPACK_DEPRECATED = 1

 # The variable type assumed for the length of character arguments when passing
 # data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
 # versions used "int"). Mismatches will not cause runtime failures but may result
 # in build warnings or errors when building with link-time optimization (LTO)
 # LAPACK_STRLEN=int

 # Build RecursiveLAPACK on top of LAPACK
 # BUILD_RELAPACK = 1
 # Have RecursiveLAPACK actually replace standard LAPACK routines instead of 
--- a/Makefile.system
+++ b/Makefile.system
@@ -277,6 +277,12 @@ endif
 ifeq ($(ARCH), arm64)
 GEMM_GEMV_FORWARD = 1
 endif
 ifeq ($(ARCH), riscv)
 GEMM_GEMV_FORWARD = 1
 endif
 ifeq ($(ARCH), power)
 GEMM_GEMV_FORWARD = 1
 endif

 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -57,7 +57,11 @@ if (DYNAMIC_ARCH)
 	  set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
  endif ()
  
 
  if (RISCV64)
 	  set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) 
  endif ()

  if (X86)
    set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
  endif ()
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
 endif ()

 if (DYNAMIC_ARCH)
  if (X86 OR X86_64 OR ARM64 OR POWER)
  if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
    if (DYNAMIC_OLDER)
      set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@@ -621,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
 set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")

 #For LAPACK Fortran codes.
 set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}")
 set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
 if (LAPACK_STRLEN)
 	set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
 endif()
 set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")

 #Disable -fopenmp for LAPACK Fortran codes on Windows.
--- a/common_thread.h
+++ b/common_thread.h
@@ -111,8 +111,8 @@ typedef struct blas_queue {
  struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
  // CRITICAL_SECTION lock;
  // HANDLE finish;
   CRITICAL_SECTION lock;
   HANDLE finish;
  volatile int finished;
 #else
  pthread_mutex_t	 lock;
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -52,6 +52,8 @@ if (DYNAMIC_ARCH)
    list(APPEND COMMON_SOURCES dynamic_arm64.c)
  elseif (POWER)
    list(APPEND COMMON_SOURCES dynamic_power.c)
  elseif (RISCV64)
    list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
  else ()  
    list(APPEND COMMON_SOURCES dynamic.c)
  endif ()  
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){
 #ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
 #endif
 #ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB)
 #if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
    alloc_shm,
 #endif
 #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 	 args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
 #endif

 #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX)
 #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
  // Check if we can convert GEMM -> GEMV
  if (args.k != 0) {
    if (args.n == 1) {
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64)
 USE_GEMM3M = 1
 endif

 ifneq ($(DYNAMIC_ARCH), 1)
 ifeq ($(TARGET), GENERIC)
 USE_GEMM3M = 0
 endif
 else
 ifeq ($(CORE), GENERIC)
 USE_GEMM3M = 0
 endif
 endif

 ifeq ($(ARCH), arm)
 USE_TRMM = 1
--- a/kernel/generic/zgemm3mkernel_dump.c
+++ b/kernel/generic/zgemm3mkernel_dump.c
@@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #if 1

 #include "zgemmkernel_2x2.c"


 #else
 #include "common.h"

 /* Placeholder kernel: ignores every argument, performs no computation and
  * always returns 0. It sits in the #else branch of the "#if 1" above, so it
  * is currently compiled out in favour of the included zgemmkernel_2x2.c. */
 int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc)
 {
  return 0;
 }
 #endif
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            }
        }
    }
    return(0);
 }
--- a/kernel/x86_64/sscal.c
+++ b/kernel/x86_64/sscal.c
@@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            }
        }
    }
    return(0);
 }
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
--- a/lapack-netlib/SRC/cgehrd.f
+++ b/lapack-netlib/SRC/cgehrd.f
@@ -163,7 +163,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
      SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK,
     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -193,7 +194,8 @@
      COMPLEX            EI
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM,
      EXTERNAL           CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB,
     $                   CTRMM,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -230,7 +232,7 @@
         IF( NH.LE.1 ) THEN
            LWKOPT = 1
         ELSE
            NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
            NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI,
     $                              -1 ) )
            LWKOPT = N*NB + TSIZE
         END IF
--- a/lapack-netlib/SRC/chetrd.f
+++ b/lapack-netlib/SRC/chetrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexHEcomputational
 *> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
      SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -225,7 +226,8 @@
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      EXTERNAL           LSAME, ILAENV
      REAL               SROUNDUP_LWORK
      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -249,8 +251,8 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
         LWKOPT = N*NB
         WORK( 1 ) = LWKOPT
         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      END IF
 *
      IF( INFO.NE.0 ) THEN
@@ -367,7 +369,7 @@
     $                TAU( I ), IINFO )
      END IF
 *
      WORK( 1 ) = LWKOPT
      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      RETURN
 *
 *     End of CHETRD
--- a/lapack-netlib/SRC/dlanv2.f
+++ b/lapack-netlib/SRC/dlanv2.f
@@ -109,7 +109,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *> \ingroup lanv2
 *
 *> \par Further Details:
 *  =====================
@@ -144,7 +144,7 @@
 *     ..
 *     .. Local Scalars ..
      DOUBLE PRECISION   AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, 
     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
     $                   SAFMN2, SAFMX2
      INTEGER            COUNT
 *     ..
@@ -248,10 +248,14 @@
 *
 *           Compute [ A  B ] = [ CS  SN ] [ AA  BB ]
 *                   [ C  D ]   [-SN  CS ] [ CC  DD ]
 *
 *           Note: Some of the multiplications are wrapped in parentheses to
 *                 prevent compilers from using FMA instructions. See
 *                 https://github.com/Reference-LAPACK/lapack/issues/1031.
 *
            A = AA*CS + CC*SN
            B = BB*CS + DD*SN
            C = -AA*SN + CC*CS
            B = ( BB*CS ) + ( DD*SN )
            C = -( AA*SN ) + ( CC*CS )
            D = -BB*SN + DD*CS
 *
            TEMP = HALF*( A+D )
--- a/lapack-netlib/SRC/sgelqt.f
+++ b/lapack-netlib/SRC/sgelqt.f
@@ -18,7 +18,7 @@
 *>
 *> \verbatim
 *>
 *> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A
 *> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A
 *> using the compact WY representation of Q.
 *> \endverbatim
 *
@@ -93,7 +93,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleGEcomputational
 *> \ingroup gelqt
 *
 *> \par Further Details:
 *  =====================
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@@ -74,7 +74,7 @@
 *>          A is REAL array, dimension
 *>                               (LDA,M) if SIDE = 'L',
 *>                               (LDA,N) if SIDE = 'R'
 *>          Part of the data structure to represent Q as returned by DGELQ.
 *>          Part of the data structure to represent Q as returned by SGELQ.
 *> \endverbatim
 *>
 *> \param[in] LDA
--- a/lapack-netlib/SRC/sgemlqt.f
+++ b/lapack-netlib/SRC/sgemlqt.f
@@ -20,7 +20,7 @@
 *>
 *> \verbatim
 *>
 *> DGEMLQT overwrites the general real M-by-N matrix C with
 *> SGEMLQT overwrites the general real M-by-N matrix C with
 *>
 *>                 SIDE = 'L'     SIDE = 'R'
 *> TRANS = 'N':      Q C            C Q
@@ -145,7 +145,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleGEcomputational
 *> \ingroup gemlqt
 *
 *  =====================================================================
      SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT,
--- a/lapack-netlib/SRC/slanv2.f
+++ b/lapack-netlib/SRC/slanv2.f
@@ -109,7 +109,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup realOTHERauxiliary
 *> \ingroup lanv2
 *
 *> \par Further Details:
 *  =====================
@@ -144,7 +144,7 @@
 *     ..
 *     .. Local Scalars ..
      REAL               AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, 
     $                   SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
     $                   SAFMN2, SAFMX2
      INTEGER            COUNT
 *     ..
@@ -248,10 +248,14 @@
 *
 *           Compute [ A  B ] = [ CS  SN ] [ AA  BB ]
 *                   [ C  D ]   [-SN  CS ] [ CC  DD ]
 *
 *           Note: Some of the multiplications are wrapped in parentheses to
 *                 prevent compilers from using FMA instructions. See
 *                 https://github.com/Reference-LAPACK/lapack/issues/1031.
 *
            A = AA*CS + CC*SN
            B = BB*CS + DD*SN
            C = -AA*SN + CC*CS
            B = ( BB*CS ) + ( DD*SN )
            C = -( AA*SN ) + ( CC*CS )
            D = -BB*SN + DD*CS
 *
            TEMP = HALF*( A+D )
--- a/lapack-netlib/SRC/ssytrd.f
+++ b/lapack-netlib/SRC/ssytrd.f
@@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
      SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -248,7 +249,7 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
         LWKOPT = N*NB
         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      END IF
 *
@@ -316,7 +317,8 @@
 *           Update the unreduced submatrix A(1:i-1,1:i-1), using an
 *           update of the form:  A := A - V*W**T - W*V**T
 *
            CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ),
            CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1,
     $                   I ),
     $                   LDA, WORK, LDWORK, ONE, A, LDA )
 *
 *           Copy superdiagonal elements back into A, and diagonal
--- a/lapack-netlib/SRC/zhetrd.f
+++ b/lapack-netlib/SRC/zhetrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16HEcomputational
 *> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
      SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -249,7 +250,7 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
         LWKOPT = N*NB
         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = LWKOPT
      END IF
 *
--- a/test/Makefile
+++ b/test/Makefile
@@ -189,8 +189,11 @@ endif
 endif


 ifeq ($(SUPPORT_GEMM3M),1)
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m
 else
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)

 endif

 ifneq ($(CROSS), 1)
 	rm -f ?BLAT3.SUMM
@@ -263,7 +266,7 @@ endif
 endif


 level3_3m : zblat3_3m cblat3_3m
 level3_3m: zblat3_3m cblat3_3m
 ifneq ($(CROSS), 1)
 	rm -f ?BLAT3_3M.SUMM
 	OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat