| @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) | |||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 27.dev) | |||||
| set(OpenBLAS_PATCH_VERSION 28.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| @@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS | |||||
| option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | ||||
| set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | |||||
| option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) | ||||
| option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) | option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) | ||||
| @@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF | |||||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | ||||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF) | |||||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | ||||
| @@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if (APPLE AND BUILD_SHARED_LIBS) | |||||
| set(CMAKE_MACOSX_RPATH ON) | |||||
| endif() | |||||
| # It seems that this hack has not been required since macOS 11 Big Sur | # It seems that this hack has not been required since macOS 11 Big Sur | ||||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | ||||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | ||||
| @@ -1,4 +1,127 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.28 | |||||
| 8-Aug-2024 | |||||
| general: | |||||
| - Reworked the unfinished implementation of HUGETLB from GotoBLAS | |||||
| for allocating huge memory pages as buffers on suitable systems | |||||
| - Changed the unfinished implementation of GEMM3M for the generic | |||||
| target on all architectures to at least forward to regular GEMM | |||||
| - Improved multithreaded GEMM performance for large non-skinny matrices | |||||
| - Improved BLAS3 performance on larger multicore systems through improved | |||||
| parallelism | |||||
| - Improved performance of the initial memory allocation by reducing | |||||
| locking overhead | |||||
| - Improved performance of GBMV at small problem sizes by introducing | |||||
| a size barrier for the switch to multithreading | |||||
| - Added an implementation of the CBLAS_GEMM_BATCH extension | |||||
| - Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in | |||||
| CMAKE builds (error introduced in 0.3.27) | |||||
| - Fixed corner cases involving the handling of NAN and INFINITY | |||||
| arguments in ?SCAL on all architectures | |||||
| - Added support for cross-compiling to WebAssembly (WASM) with CMAKE (in addition | |||||
| to the already present makefile support) | |||||
| - Fixed NAN handling and potential accuracy issues in compilations with | |||||
| Intel ICX by supplying a suitable fp-model option by default | |||||
| - The contents of the github project wiki have been converted into | |||||
| a new set of documentation included with the source code. | |||||
| - It is now possible to register a callback function that replaces | |||||
| the built-in support for multithreading with an external backend | |||||
| like TBB (openblas_set_threads_callback_function) | |||||
| - Fixed potential duplication of suffixes in shared library naming | |||||
| - Improved C compiler detection by the build system to tolerate more | |||||
| naming variants for gcc builds | |||||
| - Fixed an unnecessary dependency of the utest on CBLAS | |||||
| - Fixed spurious error reports from the BLAS extensions utest | |||||
| - Fixed unwanted invocation of the GEMM3M tests in cross-compilation | |||||
| - Fixed a flaw in the makefile build that could lead to the pkgconfig | |||||
| file containing an entry of UNKNOWN for the target cpu after installing | |||||
| - Integrated fixes from the Reference-LAPACK project: | |||||
| - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961) | |||||
| - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018) | |||||
| - Fixed potential infinite loop in the LAPACK testsuite (PR 1024) | |||||
| - Make the variable type used for hidden length arguments configurable (PR 1025) | |||||
| - Fixed SYTRD workspace computation and various typos (PR 1030) | |||||
| - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033) | |||||
| x86-64: | |||||
| - reverted thread management under Windows to its state before 0.3.26 | |||||
| due to signs of race conditions in some circumstances now under study | |||||
| - fixed accidental selection of the unoptimized generic SBGEMM kernel | |||||
| in CMAKE builds for CooperLake and SapphireRapids targets | |||||
| - fixed a potential thread buffer overrun in SBSTOBF16 on small systems | |||||
| - fixed an accuracy issue in ZSCAL introduced in 0.3.26 | |||||
| - fixed compilation with CMAKE and recent releases of LLVM | |||||
| - added support for Intel Emerald Rapids and Meteor Lake cpus | |||||
| - added autodetection support for the Zhaoxin KX-7000 cpu | |||||
| - fixed autodetection of Intel Prescott (probably broken since 0.3.19) | |||||
| - fixed compilation for older targets with the Yocto SDK | |||||
| - fixed compilation of the converter-generated C versions | |||||
| of the LAPACK sources with gcc-14 | |||||
| - improved compiler options when building with CMAKE and LLVM for | |||||
| AVX512-capable targets | |||||
| - added support for supplying the L2 cache size via an environment | |||||
| variable (OPENBLAS_L2_SIZE) in case it is not correctly reported | |||||
| (as in some VM configurations) | |||||
| - improved the error message shown when thread creation fails on startup | |||||
| - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS | |||||
| arm: | |||||
| - fixed building for baremetal targets with make | |||||
| arm64: | |||||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||||
| matrix to the corresponding GEMV kernel | |||||
| - added optimized SGEMV and DGEMV kernels for A64FX | |||||
| - added optimized SVE kernels for small-matrix GEMM | |||||
| - added A64FX to the cpu list for DYNAMIC_ARCH | |||||
| - fixed building with support for cpu affinity | |||||
| - worked around accuracy problems with C/ZNRM2 on NeoverseN1 and | |||||
| Apple M targets | |||||
| - improved GEMM performance on Neoverse V1 | |||||
| - fixed compilation for NEOVERSEN2 with older compilers | |||||
| - fixed potential miscompilation of the SVE SDOT and DDOT kernels | |||||
| - fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels | |||||
| - fixed a potential overflow when using very large user-defined BUFFERSIZE | |||||
| - fixed setting the rpath entry of the dylib in CMAKE builds on MacOS | |||||
| power: | |||||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||||
| matrix to the corresponding GEMV kernel | |||||
| - significantly improved performance of SBGEMM on POWER10 | |||||
| - fixed compilation with OpenMP and the XLF compiler | |||||
| - fixed building of the BLAS extension utests under AIX | |||||
| - fixed building of parts of the LAPACK testsuite with XLF | |||||
| - fixed CSWAP/ZSWAP on big-endian POWER10 targets | |||||
| - fixed a performance regression in SAXPY on POWER10 with OpenXL | |||||
| - fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM | |||||
| - fixed building for POWER9 under FreeBSD | |||||
| - fixed a potential overflow when using very large user-defined BUFFERSIZE | |||||
| - fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV | |||||
| riscv64: | |||||
| - Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1 | |||||
| matrix to the corresponding GEMV kernel | |||||
| - fixed building for RISCV64_GENERIC with OpenMP enabled | |||||
| - added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two | |||||
| RVV 1.0 targets with vector length of 128 and 256) | |||||
| - worked around the ZVL128B kernels for AXPBY mishandling the special | |||||
| case of zero Y increment | |||||
| loongarch64: | |||||
| - improved GEMM performance on servers of the 3C5000 generation | |||||
| - improved performance and stability of DGEMM | |||||
| - improved GEMV and TRSM kernels for LSX and LASX vector ABIs | |||||
| - fixed CMAKE compilation with the INTERFACE64 option set | |||||
| - fixed compilation with CMAKE | |||||
| - worked around spurious errors flagged by the BLAS3 tests | |||||
| - worked around a miscompilation of the POTRS utest by gcc 14.1 | |||||
| mips64: | |||||
| - fixed ASUM and SUM kernels to accept negative step sizes in X | |||||
| - fixed complex GEMV kernels for MSA | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.3.27 | Version 0.3.27 | ||||
| 4-Apr-2024 | 4-Apr-2024 | ||||
| @@ -45,6 +45,10 @@ else | |||||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | ||||
| endif | endif | ||||
| ifdef LAPACK_STRLEN | |||||
| LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN) | |||||
| endif | |||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | ||||
| .PHONY : all libs netlib $(RELA) test ctest shared install | .PHONY : all libs netlib $(RELA) test ctest shared install | ||||
| @@ -178,7 +178,7 @@ endif | |||||
| @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" | @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" | ||||
| @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | ||||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | ||||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||||
| @echo 'version='$(VERSION) >> "$(PKGFILE)" | @echo 'version='$(VERSION) >> "$(PKGFILE)" | ||||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" | @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" | ||||
| @cat openblas.pc.in >> "$(PKGFILE)" | @cat openblas.pc.in >> "$(PKGFILE)" | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.27.dev | |||||
| VERSION = 0.3.28.dev | |||||
| # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a | ||||
| # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library | ||||
| @@ -134,6 +134,12 @@ VERSION = 0.3.27.dev | |||||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | # Build LAPACK Deprecated functions since LAPACK 3.6.0 | ||||
| BUILD_LAPACK_DEPRECATED = 1 | BUILD_LAPACK_DEPRECATED = 1 | ||||
| # The variable type assumed for the length of character arguments when passing | |||||
| # data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC | |||||
| # versions used "int"). Mismatches will not cause runtime failures but may result | |||||
| # in build warnings or errors when building with link-time optimization (LTO) | |||||
| # LAPACK_STRLEN=int | |||||
| # Build RecursiveLAPACK on top of LAPACK | # Build RecursiveLAPACK on top of LAPACK | ||||
| # BUILD_RELAPACK = 1 | # BUILD_RELAPACK = 1 | ||||
| # Have RecursiveLAPACK actually replace standard LAPACK routines instead of | # Have RecursiveLAPACK actually replace standard LAPACK routines instead of | ||||
| @@ -277,6 +277,12 @@ endif | |||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| GEMM_GEMV_FORWARD = 1 | GEMM_GEMV_FORWARD = 1 | ||||
| endif | endif | ||||
| # Forward GEMM calls to GEMV on RISC-V, as is done for arm64 and power. | |||||
| # NOTE(review): ARCH is presumably spelled "riscv64" here (the ChangeLog and the CMake | |||||
| # build both refer to riscv64/RISCV64); a test against "riscv" would never match - confirm. | |||||
| ifeq ($(ARCH), riscv64) | |||||
| GEMM_GEMV_FORWARD = 1 | |||||
| endif | |||||
| ifeq ($(ARCH), power) | |||||
| GEMM_GEMV_FORWARD = 1 | |||||
| endif | |||||
| ifeq ($(SMALL_MATRIX_OPT), 1) | ifeq ($(SMALL_MATRIX_OPT), 1) | ||||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | CCOMMON_OPT += -DSMALL_MATRIX_OPT | ||||
| @@ -57,7 +57,11 @@ if (DYNAMIC_ARCH) | |||||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") | set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") | ||||
| endif () | endif () | ||||
| if (RISCV64) | |||||
| set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) | |||||
| endif () | |||||
| if (X86) | if (X86) | ||||
| set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) | ||||
| endif () | endif () | ||||
| @@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT) | |||||
| endif () | endif () | ||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (X86 OR X86_64 OR ARM64 OR POWER) | |||||
| if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | ||||
| if (DYNAMIC_OLDER) | if (DYNAMIC_OLDER) | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | ||||
| @@ -621,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | |||||
| set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}") | set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}") | ||||
| #For LAPACK Fortran codes. | #For LAPACK Fortran codes. | ||||
| set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") | |||||
| set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" ) | |||||
| if (LAPACK_STRLEN) | |||||
| set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}") | |||||
| endif() | |||||
| set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | ||||
| #Disable -fopenmp for LAPACK Fortran codes on Windows. | #Disable -fopenmp for LAPACK Fortran codes on Windows. | ||||
| @@ -111,8 +111,8 @@ typedef struct blas_queue { | |||||
| struct blas_queue *next; | struct blas_queue *next; | ||||
| #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) | #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) | ||||
| // CRITICAL_SECTION lock; | |||||
| // HANDLE finish; | |||||
| CRITICAL_SECTION lock; | |||||
| HANDLE finish; | |||||
| volatile int finished; | volatile int finished; | ||||
| #else | #else | ||||
| pthread_mutex_t lock; | pthread_mutex_t lock; | ||||
| @@ -52,6 +52,8 @@ if (DYNAMIC_ARCH) | |||||
| list(APPEND COMMON_SOURCES dynamic_arm64.c) | list(APPEND COMMON_SOURCES dynamic_arm64.c) | ||||
| elseif (POWER) | elseif (POWER) | ||||
| list(APPEND COMMON_SOURCES dynamic_power.c) | list(APPEND COMMON_SOURCES dynamic_power.c) | ||||
| elseif (RISCV64) | |||||
| list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) | |||||
| else () | else () | ||||
| list(APPEND COMMON_SOURCES dynamic.c) | list(APPEND COMMON_SOURCES dynamic.c) | ||||
| endif () | endif () | ||||
| @@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| #ifdef ALLOC_DEVICEDRIVER | #ifdef ALLOC_DEVICEDRIVER | ||||
| alloc_devicedirver, | alloc_devicedirver, | ||||
| #endif | #endif | ||||
| #ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB) | |||||
| #if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB) | |||||
| alloc_shm, | alloc_shm, | ||||
| #endif | #endif | ||||
| #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) | #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) | ||||
| @@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | ||||
| #endif | #endif | ||||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) | |||||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16) | |||||
| // Check if we can convert GEMM -> GEMV | // Check if we can convert GEMM -> GEMV | ||||
| if (args.k != 0) { | if (args.k != 0) { | ||||
| if (args.n == 1) { | if (args.n == 1) { | ||||
| @@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64) | |||||
| USE_GEMM3M = 1 | USE_GEMM3M = 1 | ||||
| endif | endif | ||||
| ifneq ($(DYNAMIC_ARCH), 1) | |||||
| ifeq ($(TARGET), GENERIC) | |||||
| USE_GEMM3M = 0 | |||||
| endif | |||||
| else | |||||
| ifeq ($(CORE), GENERIC) | |||||
| USE_GEMM3M = 0 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(ARCH), arm) | ifeq ($(ARCH), arm) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| @@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #if 1 | |||||
| #include "zgemmkernel_2x2.c" | |||||
| #else | |||||
| #include "common.h" | #include "common.h" | ||||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) | int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) | ||||
| { | { | ||||
| return 0; | return 0; | ||||
| } | } | ||||
| #endif | |||||
| @@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| return(0); | |||||
| } | } | ||||
| @@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| return(0); | |||||
| } | } | ||||
| @@ -163,7 +163,8 @@ | |||||
| *> \endverbatim | *> \endverbatim | ||||
| *> | *> | ||||
| * ===================================================================== | * ===================================================================== | ||||
| SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) | |||||
| SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, | |||||
| $ INFO ) | |||||
| * | * | ||||
| * -- LAPACK computational routine -- | * -- LAPACK computational routine -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| @@ -193,7 +194,8 @@ | |||||
| COMPLEX EI | COMPLEX EI | ||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM, | |||||
| EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, | |||||
| $ CTRMM, | |||||
| $ XERBLA | $ XERBLA | ||||
| * .. | * .. | ||||
| * .. Intrinsic Functions .. | * .. Intrinsic Functions .. | ||||
| @@ -230,7 +232,7 @@ | |||||
| IF( NH.LE.1 ) THEN | IF( NH.LE.1 ) THEN | ||||
| LWKOPT = 1 | LWKOPT = 1 | ||||
| ELSE | ELSE | ||||
| NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, | |||||
| NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, | |||||
| $ -1 ) ) | $ -1 ) ) | ||||
| LWKOPT = N*NB + TSIZE | LWKOPT = N*NB + TSIZE | ||||
| END IF | END IF | ||||
| @@ -139,7 +139,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup complexHEcomputational | |||||
| *> \ingroup hetrd | |||||
| * | * | ||||
| *> \par Further Details: | *> \par Further Details: | ||||
| * ===================== | * ===================== | ||||
| @@ -188,7 +188,8 @@ | |||||
| *> \endverbatim | *> \endverbatim | ||||
| *> | *> | ||||
| * ===================================================================== | * ===================================================================== | ||||
| SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||||
| SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||||
| $ INFO ) | |||||
| * | * | ||||
| * -- LAPACK computational routine -- | * -- LAPACK computational routine -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| @@ -225,7 +226,8 @@ | |||||
| * .. External Functions .. | * .. External Functions .. | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| INTEGER ILAENV | INTEGER ILAENV | ||||
| EXTERNAL LSAME, ILAENV | |||||
| REAL SROUNDUP_LWORK | |||||
| EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK | |||||
| * .. | * .. | ||||
| * .. Executable Statements .. | * .. Executable Statements .. | ||||
| * | * | ||||
| @@ -249,8 +251,8 @@ | |||||
| * Determine the block size. | * Determine the block size. | ||||
| * | * | ||||
| NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) | NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) | ||||
| LWKOPT = N*NB | |||||
| WORK( 1 ) = LWKOPT | |||||
| LWKOPT = MAX( 1, N*NB ) | |||||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | |||||
| END IF | END IF | ||||
| * | * | ||||
| IF( INFO.NE.0 ) THEN | IF( INFO.NE.0 ) THEN | ||||
| @@ -367,7 +369,7 @@ | |||||
| $ TAU( I ), IINFO ) | $ TAU( I ), IINFO ) | ||||
| END IF | END IF | ||||
| * | * | ||||
| WORK( 1 ) = LWKOPT | |||||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | |||||
| RETURN | RETURN | ||||
| * | * | ||||
| * End of CHETRD | * End of CHETRD | ||||
| @@ -109,7 +109,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup doubleOTHERauxiliary | |||||
| *> \ingroup lanv2 | |||||
| * | * | ||||
| *> \par Further Details: | *> \par Further Details: | ||||
| * ===================== | * ===================== | ||||
| @@ -144,7 +144,7 @@ | |||||
| * .. | * .. | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | ||||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||||
| $ SAFMN2, SAFMX2 | $ SAFMN2, SAFMX2 | ||||
| INTEGER COUNT | INTEGER COUNT | ||||
| * .. | * .. | ||||
| @@ -248,10 +248,14 @@ | |||||
| * | * | ||||
| * Compute [ A B ] = [ CS SN ] [ AA BB ] | * Compute [ A B ] = [ CS SN ] [ AA BB ] | ||||
| * [ C D ] [-SN CS ] [ CC DD ] | * [ C D ] [-SN CS ] [ CC DD ] | ||||
| * | |||||
| * Note: Some of the multiplications are wrapped in parentheses to | |||||
| * prevent compilers from using FMA instructions. See | |||||
| * https://github.com/Reference-LAPACK/lapack/issues/1031. | |||||
| * | * | ||||
| A = AA*CS + CC*SN | A = AA*CS + CC*SN | ||||
| B = BB*CS + DD*SN | |||||
| C = -AA*SN + CC*CS | |||||
| B = ( BB*CS ) + ( DD*SN ) | |||||
| C = -( AA*SN ) + ( CC*CS ) | |||||
| D = -BB*SN + DD*CS | D = -BB*SN + DD*CS | ||||
| * | * | ||||
| TEMP = HALF*( A+D ) | TEMP = HALF*( A+D ) | ||||
| @@ -18,7 +18,7 @@ | |||||
| *> | *> | ||||
| *> \verbatim | *> \verbatim | ||||
| *> | *> | ||||
| *> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A | |||||
| *> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A | |||||
| *> using the compact WY representation of Q. | *> using the compact WY representation of Q. | ||||
| *> \endverbatim | *> \endverbatim | ||||
| * | * | ||||
| @@ -93,7 +93,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup doubleGEcomputational | |||||
| *> \ingroup gelqt | |||||
| * | * | ||||
| *> \par Further Details: | *> \par Further Details: | ||||
| * ===================== | * ===================== | ||||
| @@ -74,7 +74,7 @@ | |||||
| *> A is REAL array, dimension | *> A is REAL array, dimension | ||||
| *> (LDA,M) if SIDE = 'L', | *> (LDA,M) if SIDE = 'L', | ||||
| *> (LDA,N) if SIDE = 'R' | *> (LDA,N) if SIDE = 'R' | ||||
| *> Part of the data structure to represent Q as returned by DGELQ. | |||||
| *> Part of the data structure to represent Q as returned by SGELQ. | |||||
| *> \endverbatim | *> \endverbatim | ||||
| *> | *> | ||||
| *> \param[in] LDA | *> \param[in] LDA | ||||
| @@ -20,7 +20,7 @@ | |||||
| *> | *> | ||||
| *> \verbatim | *> \verbatim | ||||
| *> | *> | ||||
| *> DGEMLQT overwrites the general real M-by-N matrix C with | |||||
| *> SGEMLQT overwrites the general real M-by-N matrix C with | |||||
| *> | *> | ||||
| *> SIDE = 'L' SIDE = 'R' | *> SIDE = 'L' SIDE = 'R' | ||||
| *> TRANS = 'N': Q C C Q | *> TRANS = 'N': Q C C Q | ||||
| @@ -145,7 +145,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup doubleGEcomputational | |||||
| *> \ingroup gemlqt | |||||
| * | * | ||||
| * ===================================================================== | * ===================================================================== | ||||
| SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT, | SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT, | ||||
| @@ -109,7 +109,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup realOTHERauxiliary | |||||
| *> \ingroup lanv2 | |||||
| * | * | ||||
| *> \par Further Details: | *> \par Further Details: | ||||
| * ===================== | * ===================== | ||||
| @@ -144,7 +144,7 @@ | |||||
| * .. | * .. | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, | ||||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||||
| $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, | |||||
| $ SAFMN2, SAFMX2 | $ SAFMN2, SAFMX2 | ||||
| INTEGER COUNT | INTEGER COUNT | ||||
| * .. | * .. | ||||
| @@ -248,10 +248,14 @@ | |||||
| * | * | ||||
| * Compute [ A B ] = [ CS SN ] [ AA BB ] | * Compute [ A B ] = [ CS SN ] [ AA BB ] | ||||
| * [ C D ] [-SN CS ] [ CC DD ] | * [ C D ] [-SN CS ] [ CC DD ] | ||||
| * | |||||
| * Note: Some of the multiplications are wrapped in parentheses to | |||||
| * prevent compilers from using FMA instructions. See | |||||
| * https://github.com/Reference-LAPACK/lapack/issues/1031. | |||||
| * | * | ||||
| A = AA*CS + CC*SN | A = AA*CS + CC*SN | ||||
| B = BB*CS + DD*SN | |||||
| C = -AA*SN + CC*CS | |||||
| B = ( BB*CS ) + ( DD*SN ) | |||||
| C = -( AA*SN ) + ( CC*CS ) | |||||
| D = -BB*SN + DD*CS | D = -BB*SN + DD*CS | ||||
| * | * | ||||
| TEMP = HALF*( A+D ) | TEMP = HALF*( A+D ) | ||||
| @@ -188,7 +188,8 @@ | |||||
| *> \endverbatim | *> \endverbatim | ||||
| *> | *> | ||||
| * ===================================================================== | * ===================================================================== | ||||
| SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||||
| SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||||
| $ INFO ) | |||||
| * | * | ||||
| * -- LAPACK computational routine -- | * -- LAPACK computational routine -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| @@ -248,7 +249,7 @@ | |||||
| * Determine the block size. | * Determine the block size. | ||||
| * | * | ||||
| NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) | NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) | ||||
| LWKOPT = N*NB | |||||
| LWKOPT = MAX( 1, N*NB ) | |||||
| WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -316,7 +317,8 @@ | |||||
| * Update the unreduced submatrix A(1:i-1,1:i-1), using an | * Update the unreduced submatrix A(1:i-1,1:i-1), using an | ||||
| * update of the form: A := A - V*W**T - W*V**T | * update of the form: A := A - V*W**T - W*V**T | ||||
| * | * | ||||
| CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ), | |||||
| CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, | |||||
| $ I ), | |||||
| $ LDA, WORK, LDWORK, ONE, A, LDA ) | $ LDA, WORK, LDWORK, ONE, A, LDA ) | ||||
| * | * | ||||
| * Copy superdiagonal elements back into A, and diagonal | * Copy superdiagonal elements back into A, and diagonal | ||||
| @@ -139,7 +139,7 @@ | |||||
| *> \author Univ. of Colorado Denver | *> \author Univ. of Colorado Denver | ||||
| *> \author NAG Ltd. | *> \author NAG Ltd. | ||||
| * | * | ||||
| *> \ingroup complex16HEcomputational | |||||
| *> \ingroup hetrd | |||||
| * | * | ||||
| *> \par Further Details: | *> \par Further Details: | ||||
| * ===================== | * ===================== | ||||
| @@ -188,7 +188,8 @@ | |||||
| *> \endverbatim | *> \endverbatim | ||||
| *> | *> | ||||
| * ===================================================================== | * ===================================================================== | ||||
| SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) | |||||
| SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, | |||||
| $ INFO ) | |||||
| * | * | ||||
| * -- LAPACK computational routine -- | * -- LAPACK computational routine -- | ||||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | * -- LAPACK is a software package provided by Univ. of Tennessee, -- | ||||
| @@ -249,7 +250,7 @@ | |||||
| * Determine the block size. | * Determine the block size. | ||||
| * | * | ||||
| NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) | NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) | ||||
| LWKOPT = N*NB | |||||
| LWKOPT = MAX( 1, N*NB ) | |||||
| WORK( 1 ) = LWKOPT | WORK( 1 ) = LWKOPT | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -189,8 +189,11 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(SUPPORT_GEMM3M),1) | |||||
| level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m | |||||
| else | |||||
| level3: $(B3) $(S3) $(D3) $(C3) $(Z3) | level3: $(B3) $(S3) $(D3) $(C3) $(Z3) | ||||
| endif | |||||
| ifneq ($(CROSS), 1) | ifneq ($(CROSS), 1) | ||||
| rm -f ?BLAT3.SUMM | rm -f ?BLAT3.SUMM | ||||
| @@ -263,7 +266,7 @@ endif | |||||
| endif | endif | ||||
| level3_3m : zblat3_3m cblat3_3m | |||||
| level3_3m: zblat3_3m cblat3_3m | |||||
| ifneq ($(CROSS), 1) | ifneq ($(CROSS), 1) | ||||
| rm -f ?BLAT3_3M.SUMM | rm -f ?BLAT3_3M.SUMM | ||||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat | OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat | ||||