From bfaf5b9ea442633ca5e3c6968c375b933b1794ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= <sebastien@debian.org>
Date: Wed, 20 Nov 2024 11:41:52 +0100
Subject: [PATCH 1/5] Restore libsuffix support in pkg-config file

It had been mistakenly removed in 9ef10ffa496b919c25aedbb4aa2fdb930901475a.
---
 openblas.pc.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openblas.pc.in b/openblas.pc.in
index d9bb84549..7632645ac 100644
--- a/openblas.pc.in
+++ b/openblas.pc.in
@@ -2,6 +2,6 @@ Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
-Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}
+Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}${libsuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir} ${omp_opt}

From fff2e214caee6e516ba1e49de81e9044d46b5a2e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 30 Dec 2024 23:05:17 +0100
Subject: [PATCH 2/5] Add LAPACK-TEST errors topic

---
 docs/faq.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/faq.md b/docs/faq.md
index 699042d51..1a3505ca9 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -51,9 +51,9 @@ In practice, the values are derived by experimentation to yield the block sizes
 
 ### <a name="reportbug"></a>How can I report a bug?
 
-Please file an issue at this [issue page](https://github.com/xianyi/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users).
+Please file an issue at this [issue page](https://github.com/OpenMathLib/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users).
 
-Please provide the following information: CPU, OS, compiler, and OpenBLAS compiling flags (Makefile.rule). In addition, please describe how to reproduce this bug.
+Please provide the following information: CPU, OS, compiler, OpenBLAS version and any compiling flags you used (Makefile.rule). In addition, please describe how to reproduce this bug.
 
 ### <a name="publication"></a>How to reference OpenBLAS.
 
@@ -105,7 +105,7 @@ Please read [this page](install.md#visual-studio).
 
 Zaheer has fixed this bug. You can now use the structure instead of C99 complex numbers. Please read [this issue page](http://github.com/xianyi/OpenBLAS/issues/95) for details.
 
-[This issue](https://github.com/xianyi/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio.
+[This issue](https://github.com/OpenMathLib/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio.
 
 ### <a name="Linux_SEGFAULT"></a>I get a SEGFAULT with multi-threading on Linux. What's wrong?
 
@@ -134,6 +134,13 @@ Background: OpenBLAS implements optimized versions of some LAPACK functions, so
 Some of the LAPACK tests, notably in xeigtstz, try to allocate around 10MB on the stack. You may need to use
 `ulimit -s` to change the default limits on your system to allow this.
 
+### <a name="lapack_test"></a>My build worked fine and passed the BLAS tests, but running `make lapack-test` ends with a number of errors in the summary report
+
+The LAPACK tests were primarily created to test the validity of the Reference-LAPACK implementation, which is implemented in unoptimized, single-threaded Fortran code. This makes them very sensitive to small numerical deviations that can result from the use of specialized CPU instructions that combine multiplications and additions without intermediate rounding and storing to memory (FMA), or from changing the order of mathematical operations by splitting an original problem workload into smaller tasks that are solved in parallel. As a result, you may encounter a small number of errors in the "numerical" column of
+the summary table at the end of the `make lapack-test` run - this is usually nothing to worry about, and the exact number and distribution of errors among the
+four data types will often vary with the optimization flags you supplied to the compiler, or the CPU model for which you built OpenBLAS. Sporadic errors in the column labeled `other` are normally the sign of failed convergence of iterative diagonalizations for the same reasons just mentioned. A more detailed error report is stored in the file testing_results.txt - this should be consulted in case of doubt. Care should be taken if you encounter numerical errors in the hundreds, or `other` errors accompanied by the LAPACK error message "on entry to function_name parameter X had an illegal value" that signals a problem with argument passing between individual functions.
+(See also [this issue](https://github.com/OpenMathLib/OpenBLAS/issues/4032) in the issue tracker on GitHub for additional discussion, examples and links.)
+
 ### <a name="no_affinity"></a>How could I disable OpenBLAS threading affinity on runtime?
 
 You can define the OPENBLAS_MAIN_FREE or GOTOBLAS_MAIN_FREE environment variable to disable threading affinity on runtime. For example, before the running,

From c37509c213a34a8cae449ededd7bc7064675ecc4 Mon Sep 17 00:00:00 2001
From: "tingbo.liao" <tingbo.liao@starfivetech.com>
Date: Tue, 31 Dec 2024 08:46:55 +0800
Subject: [PATCH 3/5] Optimize the nrm2_rvv function to further improve
 performance.

Signed-off-by: tingbo.liao <tingbo.liao@starfivetech.com>
---
 kernel/riscv64/nrm2_rvv.c | 370 +++++++++++++++++++++-----------------
 1 file changed, 204 insertions(+), 166 deletions(-)

diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
index 14ed68b0a..472b1148e 100644
--- a/kernel/riscv64/nrm2_rvv.c
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -27,185 +27,223 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(DOUBLE)
-#define VSETVL             __riscv_vsetvl_e64m4
-#define FLOAT_V_T           vfloat64m4_t
-#define FLOAT_V_T_M1        vfloat64m1_t
-#define VLEV_FLOAT          __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
-#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
-#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f64m4
-#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
-#define MASK_T              vbool16_t
-#define VFABS               __riscv_vfabs_v_f64m4
-#define VMFNE               __riscv_vmfne_vf_f64m4_b16
-#define VMFGT               __riscv_vmfgt_vv_f64m4_b16
-#define VMFEQ               __riscv_vmfeq_vf_f64m4_b16
-#define VCPOP               __riscv_vcpop_m_b16
-#define VFREDMAX            __riscv_vfredmax_vs_f64m4_f64m1
-#define VFREDMIN            __riscv_vfredmin_vs_f64m4_f64m1
-#define VFIRST              __riscv_vfirst_m_b16
-#define VRGATHER            __riscv_vrgather_vx_f64m4
-#define VFDIV               __riscv_vfdiv_vv_f64m4
-#define VFDIV_M             __riscv_vfdiv_vv_f64m4_mu
-#define VFMUL               __riscv_vfmul_vv_f64m4
-#define VFMUL_M             __riscv_vfmul_vv_f64m4_mu
-#define VFMACC              __riscv_vfmacc_vv_f64m4
-#define VFMACC_M            __riscv_vfmacc_vv_f64m4_mu
-#define VMSBF               __riscv_vmsbf_m_b16
-#define VMSOF               __riscv_vmsof_m_b16
-#define VMAND               __riscv_vmand_mm_b16
-#define VMANDN              __riscv_vmand_mm_b16
-#define VFREDSUM            __riscv_vfredusum_vs_f64m4_f64m1
-#define VMERGE              __riscv_vmerge_vvm_f64m4
-#define VSEV_FLOAT          __riscv_vse64_v_f64m4
-#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
-#define ABS fabs
-#else
-#define VSETVL              __riscv_vsetvl_e32m4
+#if !defined(DOUBLE)
+#define VSETVL(n)           __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T           vfloat32m4_t
 #define FLOAT_V_T_M1        vfloat32m1_t
+#define MASK_T              vbool8_t
 #define VLEV_FLOAT          __riscv_vle32_v_f32m4
 #define VLSEV_FLOAT         __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f32m4_f32m1_tu
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f32m4_tu
 #define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m4
-#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f32m4
 #define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f32m1
-#define MASK_T              vbool8_t
-#define VFABS               __riscv_vfabs_v_f32m4
-#define VMFNE               __riscv_vmfne_vf_f32m4_b8
-#define VMFGT               __riscv_vmfgt_vv_f32m4_b8
-#define VMFEQ               __riscv_vmfeq_vf_f32m4_b8
-#define VCPOP               __riscv_vcpop_m_b8
-#define VFREDMAX            __riscv_vfredmax_vs_f32m4_f32m1
-#define VFREDMIN            __riscv_vfredmin_vs_f32m4_f32m1
-#define VFIRST              __riscv_vfirst_m_b8
-#define VRGATHER            __riscv_vrgather_vx_f32m4
-#define VFDIV               __riscv_vfdiv_vv_f32m4
-#define VFDIV_M             __riscv_vfdiv_vv_f32m4_mu
-#define VFMUL               __riscv_vfmul_vv_f32m4
-#define VFMUL_M             __riscv_vfmul_vv_f32m4_mu
-#define VFMACC              __riscv_vfmacc_vv_f32m4
-#define VFMACC_M            __riscv_vfmacc_vv_f32m4_mu
-#define VMSBF               __riscv_vmsbf_m_b8
-#define VMSOF               __riscv_vmsof_m_b8
-#define VMAND               __riscv_vmand_mm_b8
-#define VMANDN              __riscv_vmand_mm_b8
-#define VFREDSUM            __riscv_vfredusum_vs_f32m4_f32m1
-#define VMERGE              __riscv_vmerge_vvm_f32m4
-#define VSEV_FLOAT          __riscv_vse32_v_f32m4
-#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
+#define VMFIRSTM            __riscv_vfirst_m_b8
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f32m1_f32
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f32m4_b8
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f32m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f32m4
 #define ABS fabsf
+#else
+#define VSETVL(n)           __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T           vfloat64m4_t
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define MASK_T              vbool16_t
+#define VLEV_FLOAT          __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f64m4_f64m1_tu
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f64m4_tu
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
+#define VMFIRSTM            __riscv_vfirst_m_b16
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f64m1_f64
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f64m4_b16
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f64m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f64m4
+#define ABS fabs
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0;
-
-	if (n <= 0 || inc_x == 0) return(0.0);
-        if(n == 1) return (ABS(x[0]));
-
-        unsigned int gvl = 0;
-
-        MASK_T nonzero_mask;
-        MASK_T scale_mask;
-
-        gvl = VSETVL(n);
-        FLOAT_V_T v0;
-        FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
-        FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
-
-        FLOAT scale = 0;
-        FLOAT ssq = 0;
-        unsigned int stride_x = inc_x * sizeof(FLOAT);
-        int idx = 0;
-
-        if( n >= gvl && inc_x > 0 ) // don't pay overheads if we're not doing useful work
-        {
-                for(i=0; i<n/gvl; i++){
-                        v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
-                        nonzero_mask = VMFNE( v0, 0, gvl );
-                        v0 = VFABS( v0, gvl );
-                        scale_mask = VMFGT( v0, v_scale, gvl );
-
-                        // assume scale changes are relatively infrequent
-
-                        // unclear if the vcpop+branch is actually a win
-                        // since the operations being skipped are predicated anyway
-                        // need profiling to confirm
-                        if( VCPOP(scale_mask, gvl) ) 
-                        {
-                                v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
-                                v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
-                                v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
-                                v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
-                        }
-                        v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
-                        v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
-                        idx += inc_x * gvl;
-                }
-
-                // we have gvl elements which we accumulated independently, with independent scales
-                // we need to combine these
-                // naive sort so we process small values first to avoid losing information
-                // could use vector sort extensions where available, but we're dealing with gvl elts at most
-
-                FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
-                FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
-                VSEV_FLOAT( out_ssq, v_ssq, gvl );
-                VSEV_FLOAT( out_scale, v_scale, gvl );
-                for( int a = 0; a < (gvl-1); ++a )
-                {
-                        int smallest = a;
-                        for( size_t b = a+1; b < gvl; ++b )
-                                if( out_scale[b] < out_scale[smallest] )
-                                        smallest = b;
-                        if( smallest != a )
-                        {
-                                FLOAT tmp1 = out_ssq[a];
-                                FLOAT tmp2 = out_scale[a];
-                                out_ssq[a] = out_ssq[smallest];
-                                out_scale[a] = out_scale[smallest];
-                                out_ssq[smallest] = tmp1;
-                                out_scale[smallest] = tmp2;
-                        }
-                }
-
-                int a = 0;
-                while( a<gvl && out_scale[a] == 0 )
-                        ++a;
-
-                if( a < gvl ) 
-                {
-                        ssq = out_ssq[a];
-                        scale = out_scale[a];
-                        ++a;
-                        for( ; a < gvl; ++a ) 
-                        {
-                                ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
-                                scale = out_scale[a];
-                        }
-                }
-        }
-
-        //finish any tail using scalar ops
-        i*=gvl*inc_x;
-        n*=inc_x;
+    if (n <= 0 || inc_x == 0) return(0.0);
+    if ( n == 1 ) return( ABS(x[0]) );
+    // Scaled sum of squares: the result is scale * sqrt(ssq), where scale is the largest |x[i]| seen so far.
+    BLASLONG i = 0, j = 0;
+	FLOAT scale = 0.0, ssq = 0.0;
+	// inc_x > 0 takes the RVV path below; inc_x < 0 falls back to the scalar loop.
+	if( inc_x > 0 ){
+		FLOAT_V_T vr, v0, v_zero;
+		unsigned int gvl = 0;
+		FLOAT_V_T_M1 v_res, v_z0;
+		gvl = VSETVL_MAX;
+		v_res = VFMVVF_FLOAT_M1(0, gvl);
+		v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+		MASK_T mask;
+		BLASLONG index = 0;
+
+		if (inc_x == 1) {
+			gvl = VSETVL(n);
+			vr = VFMVVF_FLOAT(0, gvl);
+			v_zero = VFMVVF_FLOAT(0, gvl);
+			for (i = 0, j = 0; i < n / gvl; i++) {
+				v0 = VLEV_FLOAT(&x[j], gvl);
+				// work on magnitudes: v0 = |x[j .. j+gvl)|
+				v0 = VFABSV_FLOAT(v0, gvl);
+				// flag the lanes whose magnitude exceeds the current scale
+				mask = VMFGTVF_FLOAT(v0, scale, gvl);
+				index = VMFIRSTM(mask, gvl);
+				if (index == -1) {	// no elements greater than scale
+					if (scale != 0.0) {
+						v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+						vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+					}
+				}
+				else {	// found greater element
+					// fold the per-lane partial sums held in vr into v_res[0]
+					v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+					// total ssq before current vector
+					ssq += VFMVFS_FLOAT(v_res);
+					// new scale: the largest magnitude in the current vector
+					v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+					// re-express the accumulated ssq relative to the new scale
+					ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res));
+					// update scale
+					scale = VFMVFS_FLOAT(v_res);
+					// restart the per-lane accumulator vr from zero, relative to the new scale
+					v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+					vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+				}
+				j += gvl;
+			}
+			// fold the per-lane partial sums held in vr into v_res[0]
+			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+			// total ssq now
+			ssq += VFMVFS_FLOAT(v_res);
+
+			// tail processing: the remaining n-j elements, with a shorter vector length
+			if(j < n){
+				gvl = VSETVL(n-j);
+				v0 = VLEV_FLOAT(&x[j], gvl);
+				// work on magnitudes
+				v0 = VFABSV_FLOAT(v0, gvl);
+				// flag the lanes whose magnitude exceeds the current scale
+				mask = VMFGTVF_FLOAT(v0, scale, gvl);
+				index = VMFIRSTM(mask, gvl);
+				if (index == -1) {	// no elements greater than scale
+					if(scale != 0.0)
+						v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+				} else {	// found greater element
+					// new scale: the largest magnitude in the tail
+					v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+					// re-express the accumulated ssq relative to the new scale
+					ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res));
+					// update scale
+					scale = VFMVFS_FLOAT(v_res);
+					v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+				}
+				vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+				// fold the tail's partial sums into v_res[0]
+				v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+				// total ssq now
+				ssq += VFMVFS_FLOAT(v_res);
+			}
+		}
+		else {
+			gvl = VSETVL(n);
+			vr = VFMVVF_FLOAT(0, gvl);
+			v_zero = VFMVVF_FLOAT(0, gvl);
+			BLASLONG stride_x = inc_x * sizeof(FLOAT);	// byte stride; BLASLONG so large inc_x is not truncated
+			BLASLONG idx = 0, inc_v = inc_x * gvl;	// element offsets; BLASLONG so n * inc_x cannot overflow an int
+			for (i = 0, j = 0; i < n / gvl; i++) {
+				v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+				// work on magnitudes
+				v0 = VFABSV_FLOAT(v0, gvl);
+				// flag the lanes whose magnitude exceeds the current scale
+				mask = VMFGTVF_FLOAT(v0, scale, gvl);
+				index = VMFIRSTM(mask, gvl);
+				if (index == -1) {// no elements greater than scale
+					if(scale != 0.0){
+						v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+						vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+					}
+				}
+				else {	// found greater element
+					// fold the per-lane partial sums held in vr into v_res[0]
+					v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+					// total ssq before current vector
+					ssq += VFMVFS_FLOAT(v_res);
+					// new scale: the largest magnitude in the current vector
+					v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+					// re-express the accumulated ssq relative to the new scale
+					ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res));
+					// update scale
+					scale = VFMVFS_FLOAT(v_res);
+					// restart the per-lane accumulator vr from zero, relative to the new scale
+					v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+					vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+				}
+				j += gvl;
+				idx += inc_v;
+			}
+			// fold the per-lane partial sums held in vr into v_res[0]
+			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+			// total ssq now
+			ssq += VFMVFS_FLOAT(v_res);
+
+			// tail processing: the remaining n-j elements, with a shorter vector length
+			if (j < n) {
+				gvl = VSETVL(n-j);
+				v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+				// work on magnitudes
+				v0 = VFABSV_FLOAT(v0, gvl);
+				// flag the lanes whose magnitude exceeds the current scale
+				mask = VMFGTVF_FLOAT(v0, scale, gvl);
+				index = VMFIRSTM(mask, gvl);
+				if(index == -1) {	// no elements greater than scale
+					if(scale != 0.0) {
+						v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+						vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+					}
+				}
+				else {	// found greater element
+					// new scale: the largest magnitude in the tail
+					v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+					// re-express the accumulated ssq relative to the new scale
+					ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res));
+					// update scale
+					scale = VFMVFS_FLOAT(v_res);
+					v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+					vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+				}
+				// fold the tail's partial sums into v_res[0]
+				v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+				// total ssq now
+				ssq += VFMVFS_FLOAT(v_res);
+			}
+		}
+	}
+	else{
+        // using scalar ops when inc_x < 0
+        n *= inc_x;
         while(abs(i) < abs(n)){
-                if ( x[i] != 0.0 ){
-                        FLOAT absxi = ABS( x[i] );
-                        if ( scale < absxi ){
-                                ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
-                                scale = absxi ;
-                        }
-                        else{
-                                ssq += ( absxi/scale ) * ( absxi/scale );
-                        }
-
-                }
-
-                i += inc_x;
+			if ( x[i] != 0.0 ){
+				FLOAT absxi = ABS( x[i] );
+				if ( scale < absxi ){
+					ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+					scale = absxi ;
+				}
+				else{
+					ssq += ( absxi/scale ) * ( absxi/scale );
+				}
+
+			}
+			i += inc_x;
         }
-
+	}
 	return(scale * sqrt(ssq));
 }
 

From 6ad793d65ec1e5e733e3c2e2327793cc1d3b8360 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 31 Dec 2024 14:34:55 +0100
Subject: [PATCH 4/5] Fix naming of suffixed libraries in the cmake and
 pkgconfig files

---
 Makefile.install | 4 ++--
 openblas.pc.in   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index bfed157a4..486e9233e 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -191,13 +191,13 @@ endif
 #Generating OpenBLASConfig.cmake
 	@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
 	@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
-	@echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 
 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
-	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
diff --git a/openblas.pc.in b/openblas.pc.in
index 7632645ac..fe2f08720 100644
--- a/openblas.pc.in
+++ b/openblas.pc.in
@@ -2,6 +2,6 @@ Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
-Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}${libsuffix}
+Libs: -L${libdir} -l${libprefix}openblas${libsuffix}${libnamesuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir} ${omp_opt}

From e9ff70b3941d99ad101286629e0044f6de83daa5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 31 Dec 2024 15:55:13 +0100
Subject: [PATCH 5/5] Add an install_tests target to facilitate testing on
 cross-compiled targets

---
 Makefile         |  3 ++
 Makefile.install | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/Makefile b/Makefile
index 78f82dea5..4c7217734 100644
--- a/Makefile
+++ b/Makefile
@@ -426,6 +426,9 @@ dummy :
 install :
 	$(MAKE) -f Makefile.install install
 
+install_tests :	# install the test programs; the actual work is delegated to Makefile.install
+	$(MAKE) -f Makefile.install install_tests
+
 clean ::
 	@for d in $(SUBDIRS_ALL) ; \
 	do if test -d $$d; then \
diff --git a/Makefile.install b/Makefile.install
index 486e9233e..cd1dcdabc 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -227,3 +227,96 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!
+
+# Copy the previously built test programs (mode 755) and their input files (mode 644) to $(DESTDIR)$(OPENBLAS_BINARY_DIR); nothing is built here.
+install_tests : lib.grd
+	@mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifneq ($(ONLY_CBLAS), 1)
+	@install -m 755 utest/openblas_utest utest/openblas_utest_ext "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
+ifndef NO_FBLAS
+ifeq ($(BUILD_BFLOAT16),1)
+	@install -m 755 test/test_sbgemm "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+ifeq ($(BUILD_SINGLE),1)
+	@install -m 755 test/sblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/sblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/sblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/sblat2.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/sblat3.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+ifeq ($(BUILD_DOUBLE),1)
+	@install -m 755 test/dblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/dblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/dblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/dblat2.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/dblat3.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+ifeq ($(BUILD_COMPLEX),1)
+	@install -m 755 test/cblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/cblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/cblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/cblat2.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/cblat3.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifeq ($(ARCH), $(filter $(ARCH), x86 x86_64 ia64 MIPS))
+	@install -m 755 test/cblat3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/cblat3_3m.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+endif
+ifeq ($(BUILD_COMPLEX16),1)
+	@install -m 755 test/zblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/zblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 test/zblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/zblat2.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/zblat3.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifeq ($(ARCH), $(filter $(ARCH), x86 x86_64 ia64 MIPS))
+	@install -m 755 test/zblat3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 test/zblat3_3m.dat "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+endif
+endif
+endif
+ifneq ($(NO_CBLAS), 1) # the ctest programs exercise the CBLAS interface; ONLY_CBLAS is already excluded by the enclosing conditional
+ifeq ($(BUILD_SINGLE),1)
+	@install -m 755 ctest/xscblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xscblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xscblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/sin2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/sin3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+ifeq ($(BUILD_DOUBLE),1)
+	@install -m 755 ctest/xdcblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xdcblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xdcblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/din2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/din3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+ifeq ($(BUILD_COMPLEX),1)
+	@install -m 755 ctest/xccblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xccblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xccblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/cin2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/cin3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifeq ($(ARCH), $(filter $(ARCH), x86 x86_64 ia64 MIPS))
+	@install -m 755 ctest/xccblat3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/cin3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+endif
+ifeq ($(BUILD_COMPLEX16),1)
+	@install -m 755 ctest/xzcblat1 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xzcblat2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 755 ctest/xzcblat3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/zin2 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/zin3 "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+ifeq ($(ARCH), $(filter $(ARCH), x86 x86_64 ia64 MIPS))
+	@install -m 755 ctest/xzcblat3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@install -m 644 ctest/zin3_3m "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+endif
+
+endif
+ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
+	@install -m 755 cpp_thread_test/dgemm_tester cpp_thread_test/dgemv_tester "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+endif
+endif
+