|
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
#include "common.h" |
|
|
|
|
|
|
|
/*
 * Precision-dependent RVV intrinsic aliases (LMUL=4) used by the nrm2 kernel
 * below.  DOUBLE selects the f64 variants (bool16 masks), otherwise f32
 * (bool8 masks).  Fixes applied: the outer conditional was previously left
 * unclosed, VSETVL/MASK_T were multiply defined with conflicting bodies, and
 * VMANDN wrongly expanded to vmand (logical AND) instead of vmandn (AND-NOT).
 */
#if defined(DOUBLE)
#define VSETVL __riscv_vsetvl_e64m4
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define MASK_T vbool16_t
#define VFABS __riscv_vfabs_v_f64m4
#define VMFNE __riscv_vmfne_vf_f64m4_b16
#define VMFGT __riscv_vmfgt_vv_f64m4_b16
#define VMFEQ __riscv_vmfeq_vf_f64m4_b16
#define VCPOP __riscv_vcpop_m_b16
#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1
#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1
#define VFIRST __riscv_vfirst_m_b16
#define VRGATHER __riscv_vrgather_vx_f64m4
#define VFDIV __riscv_vfdiv_vv_f64m4
#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu
#define VFMUL __riscv_vfmul_vv_f64m4
#define VFMUL_M __riscv_vfmul_vv_f64m4_mu
#define VFMACC __riscv_vfmacc_vv_f64m4
#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu
#define VMSBF __riscv_vmsbf_m_b16
#define VMSOF __riscv_vmsof_m_b16
#define VMAND __riscv_vmand_mm_b16
#define VMANDN __riscv_vmandn_mm_b16
#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1
#define VMERGE __riscv_vmerge_vvm_f64m4
#define VSEV_FLOAT __riscv_vse64_v_f64m4
#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
#define ABS fabs
#else
#define VSETVL __riscv_vsetvl_e32m4
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define MASK_T vbool8_t
#define VFABS __riscv_vfabs_v_f32m4
#define VMFNE __riscv_vmfne_vf_f32m4_b8
#define VMFGT __riscv_vmfgt_vv_f32m4_b8
#define VMFEQ __riscv_vmfeq_vf_f32m4_b8
#define VCPOP __riscv_vcpop_m_b8
#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1
#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1
#define VFIRST __riscv_vfirst_m_b8
#define VRGATHER __riscv_vrgather_vx_f32m4
#define VFDIV __riscv_vfdiv_vv_f32m4
#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu
#define VFMUL __riscv_vfmul_vv_f32m4
#define VFMUL_M __riscv_vfmul_vv_f32m4_mu
#define VFMACC __riscv_vfmacc_vv_f32m4
#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu
#define VMSBF __riscv_vmsbf_m_b8
#define VMSOF __riscv_vmsof_m_b8
#define VMAND __riscv_vmand_mm_b8
#define VMANDN __riscv_vmandn_mm_b8
#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1
#define VMERGE __riscv_vmerge_vvm_f32m4
#define VSEV_FLOAT __riscv_vse32_v_f32m4
#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
#define ABS fabsf
#endif
|
|
|
|
|
|
|
/*
 * Euclidean norm (SNRM2/DNRM2): returns sqrt(sum(x[i]^2)) over n strided
 * elements, using the scale/ssq formulation (as in reference BLAS *NRM2) to
 * avoid overflow/underflow of the intermediate squares.
 *
 * Each vector lane keeps an independent running (scale, ssq) pair; the lanes
 * are combined after the main loop.  Any remainder (or the whole array when
 * the vector path is not worthwhile, e.g. inc_x < 0) is handled by scalar
 * code.  Invariant throughout: partial result == scale * sqrt(ssq).
 *
 * NOTE(review): this block previously contained two implementations spliced
 * together by a bad merge (redeclared locals, a duplicated scalar-tail body
 * that could read one element past the end).  Reconstructed as the single
 * scaled-ssq implementation; also replaced abs() on BLASLONG (int-truncating
 * on LP64) with a sign-aware comparison.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;

	if (n <= 0 || inc_x == 0) return(0.0);
	if (n == 1) return(ABS(x[0]));

	unsigned int gvl = 0;

	MASK_T nonzero_mask;
	MASK_T scale_mask;

	gvl = VSETVL(n);
	FLOAT_V_T v0;
	FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
	FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);

	FLOAT scale = 0;
	FLOAT ssq = 0;
	unsigned int stride_x = inc_x * sizeof(FLOAT);
	int idx = 0;

	if( n >= gvl && inc_x > 0 ) // don't pay vector overheads if we're not doing useful work
	{
		for(i=0; i<n/gvl; i++){
			v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
			nonzero_mask = VMFNE( v0, 0, gvl );   // zeros contribute nothing; keep them out of the divide
			v0 = VFABS( v0, gvl );
			scale_mask = VMFGT( v0, v_scale, gvl );

			// assume scale changes are relatively infrequent
			// unclear if the vcpop+branch is actually a win
			// since the operations being skipped are predicated anyway
			// need profiling to confirm
			if( VCPOP(scale_mask, gvl) )
			{
				// lanes with a new maximum: ssq *= (old_scale/new_scale)^2,
				// then adopt the new scale
				v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
				v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
				v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
				v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
			}
			// ssq += (|x|/scale)^2, only where x != 0 (scale may be 0 there)
			v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
			v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
			idx += inc_x * gvl;
		}

		// we have gvl elements which we accumulated independently, with independent scales
		// we need to combine these
		// naive sort so we process small values first to avoid losing information
		// could use vector sort extensions where available, but we're dealing with gvl elts at most
		FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
		FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
		VSEV_FLOAT( out_ssq, v_ssq, gvl );
		VSEV_FLOAT( out_scale, v_scale, gvl );
		for( unsigned int a = 0; a < (gvl-1); ++a )
		{
			unsigned int smallest = a;
			for( unsigned int b = a+1; b < gvl; ++b )
				if( out_scale[b] < out_scale[smallest] )
					smallest = b;
			if( smallest != a )
			{
				FLOAT tmp1 = out_ssq[a];
				FLOAT tmp2 = out_scale[a];
				out_ssq[a] = out_ssq[smallest];
				out_scale[a] = out_scale[smallest];
				out_ssq[smallest] = tmp1;
				out_scale[smallest] = tmp2;
			}
		}

		// skip all-zero lanes (scale == 0 would divide by zero below)
		unsigned int a = 0;
		while( a < gvl && out_scale[a] == 0 )
			++a;

		if( a < gvl )
		{
			ssq = out_ssq[a];
			scale = out_scale[a];
			++a;
			for( ; a < gvl; ++a )
			{
				// fold in the next lane: rescale our ssq to its (larger) scale
				ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
				scale = out_scale[a];
			}
		}
	}

	// finish any tail using scalar ops (also the whole array when the
	// vector path was skipped, e.g. inc_x < 0: i is still 0 then)
	i *= gvl * inc_x;
	n *= inc_x;
	// i and n carry the same sign as inc_x; compare without abs() to avoid
	// int truncation of BLASLONG
	while( inc_x > 0 ? (i < n) : (i > n) ){
		if ( x[i] != 0.0 ){
			FLOAT absxi = ABS( x[i] );
			if ( scale < absxi ){
				// new maximum: rescale ssq, this element contributes 1
				ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
				scale = absxi;
			}
			else{
				ssq += ( absxi/scale ) * ( absxi/scale );
			}
		}
		i += inc_x;
	}

	return(scale * sqrt(ssq));
}
|
|
|
|
|
|
|
|