| @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 27.dev) | |||
| set(OpenBLAS_PATCH_VERSION 28.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| @@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS | |||
| option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | |||
| set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | |||
| option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | |||
| option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) | |||
| @@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| @@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago | |||
| endif() | |||
| endif() | |||
| if (APPLE AND BUILD_SHARED_LIBS) | |||
| set(CMAKE_MACOSX_RPATH ON) | |||
| endif() | |||
| # This hack appears to be unnecessary from macOS 11 Big Sur onward | |||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| @@ -1,4 +1,127 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.28 | |||
| 8-Aug-2024 | |||
| general: | |||
| - Reworked the unfinished implementation of HUGETLB from GotoBLAS | |||
| for allocating huge memory pages as buffers on suitable systems | |||
| - Changed the unfinished implementation of GEMM3M for the generic | |||
| target on all architectures to at least forward to regular GEMM | |||
| - Improved multithreaded GEMM performance for large non-skinny matrices | |||
| - Improved BLAS3 performance on larger multicore systems through improved | |||
| parallelism | |||
| - Improved performance of the initial memory allocation by reducing | |||
| locking overhead | |||
| - Improved performance of GBMV at small problem sizes by introducing | |||
| a size barrier for the switch to multithreading | |||
| - Added an implementation of the CBLAS_GEMM_BATCH extension | |||
| - Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in | |||
| CMAKE builds (error introduced in 0.3.27) | |||
| - Fixed corner cases involving the handling of NAN and INFINITY | |||
| arguments in ?SCAL on all architectures | |||
| - Added support for cross-compiling to WebAssembly (WASM) with CMAKE (in | |||
| addition to the already present makefile support) | |||
| - Fixed NAN handling and potential accuracy issues in compilations with | |||
| Intel ICX by supplying a suitable fp-model option by default | |||
| - The contents of the GitHub project wiki have been converted into | |||
| a new set of documentation included with the source code | |||
| - It is now possible to register a callback function that replaces | |||
| the built-in support for multithreading with an external backend | |||
| like TBB (openblas_set_threads_callback_function) | |||
| - Fixed potential duplication of suffixes in shared library naming | |||
| - Improved C compiler detection by the build system to tolerate more | |||
| naming variants for gcc builds | |||
| - Fixed an unnecessary dependency of the utest on CBLAS | |||
| - Fixed spurious error reports from the BLAS extensions utest | |||
| - Fixed unwanted invocation of the GEMM3M tests in cross-compilation | |||
| - Fixed a flaw in the makefile build that could lead to the pkgconfig | |||
| file containing an entry of UNKNOWN for the target cpu after installing | |||
| - Integrated fixes from the Reference-LAPACK project: | |||
| - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961) | |||
| - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018) | |||
| - Fixed potential infinite loop in the LAPACK testsuite (PR 1024) | |||
| - Make the variable type used for hidden length arguments configurable (PR 1025) | |||
| - Fixed SYTRD workspace computation and various typos (PR 1030) | |||
| - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033) | |||
| x86-64: | |||
| - reverted thread management under Windows to its state before 0.3.26 | |||
| due to signs of race conditions in some circumstances now under study | |||
| - fixed accidental selection of the unoptimized generic SBGEMM kernel | |||
| in CMAKE builds for CooperLake and SapphireRapids targets | |||
| - fixed a potential thread buffer overrun in SBSTOBF16 on small systems | |||
| - fixed an accuracy issue in ZSCAL introduced in 0.3.26 | |||
| - fixed compilation with CMAKE and recent releases of LLVM | |||
| - added support for Intel Emerald Rapids and Meteor Lake cpus | |||
| - added autodetection support for the Zhaoxin KX-7000 cpu | |||
| - fixed autodetection of Intel Prescott (probably broken since 0.3.19) | |||
| - fixed compilation for older targets with the Yocto SDK | |||
| - fixed compilation of the converter-generated C versions | |||
| of the LAPACK sources with gcc-14 | |||
| - improved compiler options when building with CMAKE and LLVM for | |||
| AVX512-capable targets | |||
| - added support for supplying the L2 cache size via an environment | |||
| variable (OPENBLAS_L2_SIZE) in case it is not correctly reported | |||
| (as in some VM configurations) | |||
| - improved the error message shown when thread creation fails on startup | |||
| - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS | |||
| arm: | |||
| - fixed building for baremetal targets with make | |||
| arm64: | |||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||
| matrix to the corresponding GEMV kernel | |||
| - added optimized SGEMV and DGEMV kernels for A64FX | |||
| - added optimized SVE kernels for small-matrix GEMM | |||
| - added A64FX to the cpu list for DYNAMIC_ARCH | |||
| - fixed building with support for cpu affinity | |||
| - worked around accuracy problems with C/ZNRM2 on NeoverseN1 and | |||
| Apple M targets | |||
| - improved GEMM performance on Neoverse V1 | |||
| - fixed compilation for NEOVERSEN2 with older compilers | |||
| - fixed potential miscompilation of the SVE SDOT and DDOT kernels | |||
| - fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels | |||
| - fixed a potential overflow when using very large user-defined BUFFERSIZE | |||
| - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS | |||
| power: | |||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||
| matrix to the corresponding GEMV kernel | |||
| - significantly improved performance of SBGEMM on POWER10 | |||
| - fixed compilation with OpenMP and the XLF compiler | |||
| - fixed building of the BLAS extension utests under AIX | |||
| - fixed building of parts of the LAPACK testsuite with XLF | |||
| - fixed CSWAP/ZSWAP on big-endian POWER10 targets | |||
| - fixed a performance regression in SAXPY on POWER10 with OpenXL | |||
| - fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM | |||
| - fixed building for POWER9 under FreeBSD | |||
| - fixed a potential overflow when using very large user-defined BUFFERSIZE | |||
| - fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV | |||
| riscv64: | |||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||
| matrix to the corresponding GEMV kernel | |||
| - fixed building for RISCV64_GENERIC with OpenMP enabled | |||
| - added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two | |||
| RVV 1.0 targets with vector length of 128 and 256) | |||
| - worked around the ZVL128B kernels for AXPBY mishandling the special | |||
| case of zero Y increment | |||
| loongarch64: | |||
| - improved GEMM performance on servers of the 3C5000 generation | |||
| - improved performance and stability of DGEMM | |||
| - improved GEMV and TRSM kernels for LSX and LASX vector ABIs | |||
| - fixed CMAKE compilation with the INTERFACE64 option set | |||
| - fixed compilation with CMAKE | |||
| - worked around spurious errors flagged by the BLAS3 tests | |||
| - worked around a miscompilation of the POTRS utest by gcc 14.1 | |||
| mips64: | |||
| - fixed ASUM and SUM kernels to accept negative step sizes in X | |||
| - fixed complex GEMV kernels for MSA | |||
| ==================================================================== | |||
| Version 0.3.27 | |||
| 4-Apr-2024 | |||
| @@ -45,6 +45,10 @@ else | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
| endif | |||
| ifdef LAPACK_STRLEN | |||
| LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN) | |||
| endif | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||
| .PHONY : all libs netlib $(RELA) test ctest shared install | |||
| @@ -178,7 +178,7 @@ endif | |||
| @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" | |||
| @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | |||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||
| @echo 'version='$(VERSION) >> "$(PKGFILE)" | |||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" | |||
| @cat openblas.pc.in >> "$(PKGFILE)" | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.27.dev | |||
| VERSION = 0.3.28.dev | |||
| # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | |||
| # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | |||
| @@ -134,6 +134,12 @@ VERSION = 0.3.27.dev | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| BUILD_LAPACK_DEPRECATED = 1 | |||
| # The variable type assumed for the length of character arguments when passing | |||
| # data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC | |||
| # versions used "int"). Mismatches will not cause runtime failures but may result | |||
| # in build warnings or errors when building with link-time optimization (LTO) | |||
| # LAPACK_STRLEN=int | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # Have RecursiveLAPACK actually replace standard LAPACK routines instead of | |||
| @@ -277,6 +277,12 @@ endif | |||
| ifeq ($(ARCH), arm64) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| endif | |||
| ifeq ($(ARCH), riscv) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| endif | |||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
| @@ -57,7 +57,11 @@ if (DYNAMIC_ARCH) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") | |||
| endif () | |||
| if (RISCV64) | |||
| set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) | |||
| endif () | |||
| if (X86) | |||
| set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | |||
| endif () | |||
| @@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT) | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| @@ -621,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | |||
| set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}") | |||
| #For LAPACK Fortran codes. | |||
| set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") | |||
| set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" ) | |||
| if (LAPACK_STRLEN) | |||
| set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}") | |||
| endif() | |||
| set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | |||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | |||
| @@ -111,8 +111,8 @@ typedef struct blas_queue { | |||
| struct blas_queue *next; | |||
| #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) | |||
| // CRITICAL_SECTION lock; | |||
| // HANDLE finish; | |||
| CRITICAL_SECTION lock; | |||
| HANDLE finish; | |||
| volatile int finished; | |||
| #else | |||
| pthread_mutex_t lock; | |||
| @@ -52,6 +52,8 @@ if (DYNAMIC_ARCH) | |||
| list(APPEND COMMON_SOURCES dynamic_arm64.c) | |||
| elseif (POWER) | |||
| list(APPEND COMMON_SOURCES dynamic_power.c) | |||
| elseif (RISCV64) | |||
| list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) | |||
| else () | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| endif () | |||
| @@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){ | |||
| #ifdef ALLOC_DEVICEDRIVER | |||
| alloc_devicedirver, | |||
| #endif | |||
| #ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB) | |||
| #if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB) | |||
| alloc_shm, | |||
| #endif | |||
| #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) | |||
| @@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | |||
| #endif | |||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) | |||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| // Check if we can convert GEMM -> GEMV | |||
| if (args.k != 0) { | |||
| if (args.n == 1) { | |||
| @@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(TARGET), GENERIC) | |||
| USE_GEMM3M = 0 | |||
| endif | |||
| else | |||
| ifeq ($(CORE), GENERIC) | |||
| USE_GEMM3M = 0 | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), arm) | |||
| USE_TRMM = 1 | |||
| @@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if 1 | |||
| #include "zgemmkernel_2x2.c" | |||
| #else | |||
| #include "common.h" | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) | |||
| { | |||
| return 0; | |||
| } | |||
| #endif | |||
| @@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -163,7 +163,8 @@ | |||
| *> \endverbatim | |||
| *> | |||
| * ===================================================================== | |||
| SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) | |||
| SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, | |||
| $ INFO ) | |||
| * | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| @@ -193,7 +194,8 @@ | |||
| COMPLEX EI | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM, | |||
| EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, | |||
| $ CTRMM, | |||
| $ XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| @@ -230,7 +232,7 @@ | |||
| IF( NH.LE.1 ) THEN | |||
| LWKOPT = 1 | |||
| ELSE | |||
| NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, | |||
| NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, | |||
| $ -1 ) ) | |||
| LWKOPT = N*NB + TSIZE | |||
| END IF | |||
| @@ -139,7 +139,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup complexHEcomputational | |||
| *> \ingroup hetrd | |||
| * | |||
| *> \par Further Details: | |||
| * ===================== | |||
| @@ -188,7 +188,8 @@ | |||
| *> \endverbatim | |||
| *> | |||
| * ===================================================================== | |||
| SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||
| SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||
| $ INFO ) | |||
| * | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| @@ -225,7 +226,8 @@ | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| EXTERNAL LSAME, ILAENV | |||
| REAL SROUNDUP_LWORK | |||
| EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| @@ -249,8 +251,8 @@ | |||
| * Determine the block size. | |||
| * | |||
| NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| WORK( 1 ) = LWKOPT | |||
| LWKOPT = MAX( 1, N*NB ) | |||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | |||
| END IF | |||
| * | |||
| IF( INFO.NE.0 ) THEN | |||
| @@ -367,7 +369,7 @@ | |||
| $ TAU( I ), IINFO ) | |||
| END IF | |||
| * | |||
| WORK( 1 ) = LWKOPT | |||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | |||
| RETURN | |||
| * | |||
| * End of CHETRD | |||
| @@ -109,7 +109,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup doubleOTHERauxiliary | |||
| *> \ingroup lanv2 | |||
| * | |||
| *> \par Further Details: | |||
| * ===================== | |||
| @@ -144,7 +144,7 @@ | |||
| * .. | |||
| * .. Local Scalars .. | |||
| DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | |||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||
| $ SAFMN2, SAFMX2 | |||
| INTEGER COUNT | |||
| * .. | |||
| @@ -248,10 +248,14 @@ | |||
| * | |||
| * Compute [ A B ] = [ CS SN ] [ AA BB ] | |||
| * [ C D ] [-SN CS ] [ CC DD ] | |||
| * | |||
| * Note: Some of the multiplications are wrapped in parentheses to | |||
| * prevent compilers from using FMA instructions. See | |||
| * https://github.com/Reference-LAPACK/lapack/issues/1031. | |||
| * | |||
| A = AA*CS + CC*SN | |||
| B = BB*CS + DD*SN | |||
| C = -AA*SN + CC*CS | |||
| B = ( BB*CS ) + ( DD*SN ) | |||
| C = -( AA*SN ) + ( CC*CS ) | |||
| D = -BB*SN + DD*CS | |||
| * | |||
| TEMP = HALF*( A+D ) | |||
| @@ -18,7 +18,7 @@ | |||
| *> | |||
| *> \verbatim | |||
| *> | |||
| *> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A | |||
| *> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A | |||
| *> using the compact WY representation of Q. | |||
| *> \endverbatim | |||
| * | |||
| @@ -93,7 +93,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup doubleGEcomputational | |||
| *> \ingroup gelqt | |||
| * | |||
| *> \par Further Details: | |||
| * ===================== | |||
| @@ -74,7 +74,7 @@ | |||
| *> A is REAL array, dimension | |||
| *> (LDA,M) if SIDE = 'L', | |||
| *> (LDA,N) if SIDE = 'R' | |||
| *> Part of the data structure to represent Q as returned by DGELQ. | |||
| *> Part of the data structure to represent Q as returned by SGELQ. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LDA | |||
| @@ -20,7 +20,7 @@ | |||
| *> | |||
| *> \verbatim | |||
| *> | |||
| *> DGEMLQT overwrites the general real M-by-N matrix C with | |||
| *> SGEMLQT overwrites the general real M-by-N matrix C with | |||
| *> | |||
| *> SIDE = 'L' SIDE = 'R' | |||
| *> TRANS = 'N': Q C C Q | |||
| @@ -145,7 +145,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup doubleGEcomputational | |||
| *> \ingroup gemlqt | |||
| * | |||
| * ===================================================================== | |||
| SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT, | |||
| @@ -109,7 +109,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup realOTHERauxiliary | |||
| *> \ingroup lanv2 | |||
| * | |||
| *> \par Further Details: | |||
| * ===================== | |||
| @@ -144,7 +144,7 @@ | |||
| * .. | |||
| * .. Local Scalars .. | |||
| REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | |||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||
| $ SAFMN2, SAFMX2 | |||
| INTEGER COUNT | |||
| * .. | |||
| @@ -248,10 +248,14 @@ | |||
| * | |||
| * Compute [ A B ] = [ CS SN ] [ AA BB ] | |||
| * [ C D ] [-SN CS ] [ CC DD ] | |||
| * | |||
| * Note: Some of the multiplications are wrapped in parentheses to | |||
| * prevent compilers from using FMA instructions. See | |||
| * https://github.com/Reference-LAPACK/lapack/issues/1031. | |||
| * | |||
| A = AA*CS + CC*SN | |||
| B = BB*CS + DD*SN | |||
| C = -AA*SN + CC*CS | |||
| B = ( BB*CS ) + ( DD*SN ) | |||
| C = -( AA*SN ) + ( CC*CS ) | |||
| D = -BB*SN + DD*CS | |||
| * | |||
| TEMP = HALF*( A+D ) | |||
| @@ -188,7 +188,8 @@ | |||
| *> \endverbatim | |||
| *> | |||
| * ===================================================================== | |||
| SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||
| SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||
| $ INFO ) | |||
| * | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| @@ -248,7 +249,7 @@ | |||
| * Determine the block size. | |||
| * | |||
| NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| LWKOPT = MAX( 1, N*NB ) | |||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | |||
| END IF | |||
| * | |||
| @@ -316,7 +317,8 @@ | |||
| * Update the unreduced submatrix A(1:i-1,1:i-1), using an | |||
| * update of the form: A := A - V*W**T - W*V**T | |||
| * | |||
| CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ), | |||
| CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, | |||
| $ I ), | |||
| $ LDA, WORK, LDWORK, ONE, A, LDA ) | |||
| * | |||
| * Copy superdiagonal elements back into A, and diagonal | |||
| @@ -139,7 +139,7 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup complex16HEcomputational | |||
| *> \ingroup hetrd | |||
| * | |||
| *> \par Further Details: | |||
| * ===================== | |||
| @@ -188,7 +188,8 @@ | |||
| *> \endverbatim | |||
| *> | |||
| * ===================================================================== | |||
| SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||
| SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||
| $ INFO ) | |||
| * | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| @@ -249,7 +250,7 @@ | |||
| * Determine the block size. | |||
| * | |||
| NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) | |||
| LWKOPT = N*NB | |||
| LWKOPT = MAX( 1, N*NB ) | |||
| WORK( 1 ) = LWKOPT | |||
| END IF | |||
| * | |||
| @@ -189,8 +189,11 @@ endif | |||
| endif | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m | |||
| else | |||
| level3: $(B3) $(S3) $(D3) $(C3) $(Z3) | |||
| endif | |||
| ifneq ($(CROSS), 1) | |||
| rm -f ?BLAT3.SUMM | |||
| @@ -263,7 +266,7 @@ endif | |||
| endif | |||
| level3_3m : zblat3_3m cblat3_3m | |||
| level3_3m: zblat3_3m cblat3_3m | |||
| ifneq ($(CROSS), 1) | |||
| rm -f ?BLAT3_3M.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat | |||