Merge from develop branch for 0.3.23 (tags/v0.3.23)
| @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) | |||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 22) | |||||
| set(OpenBLAS_PATCH_VERSION 22.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| @@ -1,4 +1,21 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.23 | |||||
| 01-Apr-2023 | |||||
| general: | |||||
| - fixed a serious regression in GETRF/GETF2 and ZGETRF/ZGETF2 where | |||||
| subnormal but nonzero data elements triggered the singularity flag | |||||
| - fixed a long-standing bug in CSPR/ZSPR in single-threaded operation | |||||
| for cases where elements of the X vector are purely real (zero | |||||
| imaginary part) or purely imaginary (zero real part) | |||||
| - fixed gmake builds with the option NO_LAPACK | |||||
| - fixed a few instances in the gmake Makefiles where expressly | |||||
| setting NO_LAPACK=0 or NO_LAPACKE=0 would have the opposite effect | |||||
| x86_64: | |||||
| - added further CPUID values for Intel Raptor Lake | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.3.22 | Version 0.3.22 | ||||
| 26-Mar-2023 | 26-Mar-2023 | ||||
| @@ -77,7 +77,7 @@ endif | |||||
| endif | endif | ||||
| ifneq ($(OSNAME), AIX) | ifneq ($(OSNAME), AIX) | ||||
| ifndef NO_LAPACKE | |||||
| ifneq ($(NO_LAPACKE), 1) | |||||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | ||||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | ||||
| @@ -127,7 +127,7 @@ endif | |||||
| else | else | ||||
| #install on AIX has different options syntax | #install on AIX has different options syntax | ||||
| ifndef NO_LAPACKE | |||||
| ifneq ($(NO_LAPACKE), 1) | |||||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | ||||
| @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | @-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.22 | |||||
| VERSION = 0.3.22.dev | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -1547,6 +1547,8 @@ int get_cpuname(void){ | |||||
| case 11: //family 6 exmodel 11 | case 11: //family 6 exmodel 11 | ||||
| switch (model) { | switch (model) { | ||||
| case 7: // Raptor Lake | case 7: // Raptor Lake | ||||
| case 10: | |||||
| case 15: | |||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return CPUTYPE_HASWELL; | return CPUTYPE_HASWELL; | ||||
| if(support_avx()) | if(support_avx()) | ||||
| @@ -2348,6 +2350,8 @@ int get_coretype(void){ | |||||
| case 11: | case 11: | ||||
| switch (model) { | switch (model) { | ||||
| case 7: // Raptor Lake | case 7: // Raptor Lake | ||||
| case 10: | |||||
| case 15: | |||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return CORE_HASWELL; | return CORE_HASWELL; | ||||
| @@ -92,7 +92,7 @@ CBLASOBJS += \ | |||||
| ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ | ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ | ||||
| ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) | ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) | ||||
| ifndef NO_LAPACK | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| CBLASOBJS += \ | CBLASOBJS += \ | ||||
| cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | ||||
| cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ | cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ | ||||
| @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, | |||||
| for (i = 0; i < m; i++){ | for (i = 0; i < m; i++){ | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { | |||||
| if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { | |||||
| AXPYU_K(i + 1, 0, 0, | AXPYU_K(i + 1, 0, 0, | ||||
| alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], | alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], | ||||
| alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], | alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], | ||||
| @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, | |||||
| } | } | ||||
| a += (i + 1) * 2; | a += (i + 1) * 2; | ||||
| #else | #else | ||||
| if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { | |||||
| if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { | |||||
| AXPYU_K(m - i, 0, 0, | AXPYU_K(m - i, 0, 0, | ||||
| alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], | alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], | ||||
| alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], | alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], | ||||
| @@ -92,9 +92,8 @@ CBLAS2OBJS = \ | |||||
| cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ | cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ | ||||
| ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ | ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ | ||||
| csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ | csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ | ||||
| csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ | |||||
| cspr.$(SUFFIX) cspr2.$(SUFFIX) \ | |||||
| csymv.$(SUFFIX) csyr.$(SUFFIX) \ | |||||
| csbmv.$(SUFFIX) \ | |||||
| cspr2.$(SUFFIX) \ | |||||
| ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ | ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ | ||||
| ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ | ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ | ||||
| chemv.$(SUFFIX) chbmv.$(SUFFIX) \ | chemv.$(SUFFIX) chbmv.$(SUFFIX) \ | ||||
| @@ -122,9 +121,8 @@ ZBLAS2OBJS = \ | |||||
| zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ | zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ | ||||
| ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ | ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ | ||||
| zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ | zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ | ||||
| zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ | |||||
| zspr.$(SUFFIX) zspr2.$(SUFFIX) \ | |||||
| zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ | |||||
| zsbmv.$(SUFFIX) \ | |||||
| zspr2.$(SUFFIX) \ | |||||
| ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ | ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ | ||||
| ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ | ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ | ||||
| zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ | zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ | ||||
| @@ -447,7 +445,8 @@ QLAPACKOBJS = \ | |||||
| CLAPACKOBJS = \ | CLAPACKOBJS = \ | ||||
| cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | ||||
| cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | ||||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) | |||||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ | |||||
| cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) | |||||
| #ZLAPACKOBJS = \ | #ZLAPACKOBJS = \ | ||||
| # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | ||||
| @@ -458,8 +457,8 @@ CLAPACKOBJS = \ | |||||
| ZLAPACKOBJS = \ | ZLAPACKOBJS = \ | ||||
| zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | ||||
| zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | ||||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) | |||||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ | |||||
| zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) | |||||
| XLAPACKOBJS = \ | XLAPACKOBJS = \ | ||||
| xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ | xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ | ||||
| @@ -1021,7 +1020,7 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c | |||||
| qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c | qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ifndef NO_LAPACK | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c | csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -1041,7 +1040,7 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c | |||||
| qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c | qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ifndef NO_LAPACK | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c | csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -1115,7 +1114,7 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c | |||||
| qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c | qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ifndef NO_LAPACK | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c | cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -1135,7 +1134,7 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c | |||||
| qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c | qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| ifndef NO_LAPACK | |||||
| ifneq ($(NO_LAPACK), 1) | |||||
| cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c | cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -100,16 +100,21 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||||
| jp--; | jp--; | ||||
| temp1 = *(b + jp); | temp1 = *(b + jp); | ||||
| //if (temp1 != ZERO) { | |||||
| if (temp1 != ZERO) { | |||||
| #if defined(DOUBLE) | |||||
| if (fabs(temp1) >= DBL_MIN ) { | if (fabs(temp1) >= DBL_MIN ) { | ||||
| temp1 = dp1 / temp1; | |||||
| if (jp != j) { | |||||
| SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); | |||||
| } | |||||
| if (j + 1 < m) { | |||||
| SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); | |||||
| } | |||||
| #else | |||||
| if (fabs(temp1) >= FLT_MIN ) { | |||||
| #endif | |||||
| temp1 = dp1 / temp1; | |||||
| if (jp != j) { | |||||
| SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); | |||||
| } | |||||
| if (j + 1 < m) { | |||||
| SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| if (!info) info = j + 1; | if (!info) info = j + 1; | ||||
| } | } | ||||
| @@ -106,30 +106,34 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||||
| temp1 = *(b + jp * 2 + 0); | temp1 = *(b + jp * 2 + 0); | ||||
| temp2 = *(b + jp * 2 + 1); | temp2 = *(b + jp * 2 + 1); | ||||
| // if ((temp1 != ZERO) || (temp2 != ZERO)) { | |||||
| if ((fabs(temp1) >= DBL_MIN) && (fabs(temp2) >= DBL_MIN)) { | |||||
| if (jp != j) { | |||||
| SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, | |||||
| if ((temp1 != ZERO) || (temp2 != ZERO)) { | |||||
| #if defined(DOUBLE) | |||||
| if ((fabs(temp1) >= DBL_MIN) || (fabs(temp2) >= DBL_MIN)) { | |||||
| #else | |||||
| if ((fabs(temp1) >= FLT_MIN) || (fabs(temp2) >= FLT_MIN)) { | |||||
| #endif | |||||
| if (jp != j) { | |||||
| SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, | |||||
| a + jp * 2, lda, NULL, 0); | a + jp * 2, lda, NULL, 0); | ||||
| } | |||||
| if (fabs(temp1) >= fabs(temp2)){ | |||||
| ratio = temp2 / temp1; | |||||
| den = dp1 /(temp1 * ( 1 + ratio * ratio)); | |||||
| temp3 = den; | |||||
| temp4 = -ratio * den; | |||||
| } else { | |||||
| ratio = temp1 / temp2; | |||||
| den = dp1 /(temp2 * ( 1 + ratio * ratio)); | |||||
| temp3 = ratio * den; | |||||
| temp4 = -den; | |||||
| } | |||||
| if (j + 1 < m) { | |||||
| SCAL_K(m - j - 1, 0, 0, temp3, temp4, | |||||
| b + (j + 1) * 2, 1, NULL, 0, NULL, 0); | |||||
| } | |||||
| } | |||||
| if (fabs(temp1) >= fabs(temp2)){ | |||||
| ratio = temp2 / temp1; | |||||
| den = dp1 /(temp1 * ( 1 + ratio * ratio)); | |||||
| temp3 = den; | |||||
| temp4 = -ratio * den; | |||||
| } else { | |||||
| ratio = temp1 / temp2; | |||||
| den = dp1 /(temp2 * ( 1 + ratio * ratio)); | |||||
| temp3 = ratio * den; | |||||
| temp4 = -den; | |||||
| } | |||||
| if (j + 1 < m) { | |||||
| SCAL_K(m - j - 1, 0, 0, temp3, temp4, | |||||
| b + (j + 1) * 2, 1, NULL, 0, NULL, 0); | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| if (!info) info = j + 1; | if (!info) info = j + 1; | ||||
| } | } | ||||