| @@ -46,7 +46,7 @@ TARGET_FLAGS = -mips64r6 | |||
| endif | |||
| ifeq ($(TARGET), C910V) | |||
| TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v | |||
| TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| @@ -1,4 +1,4 @@ | |||
| ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v | |||
| FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
| @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define SEEK_ADDRESS | |||
| #if defined(C910V) | |||
| #include <riscv-vector.h> | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| #endif | |||
| @@ -88,8 +88,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -97,8 +97,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -113,8 +113,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -122,8 +122,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -131,7 +131,8 @@ asm volatile( | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| //maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -144,8 +145,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -153,14 +154,14 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -179,8 +180,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -188,8 +189,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -204,8 +205,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -213,8 +214,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -223,7 +224,7 @@ asm volatile( | |||
| ix += inc_xv*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -236,8 +237,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -245,14 +246,14 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -87,8 +87,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -96,8 +96,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| @@ -111,8 +111,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -120,8 +120,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -129,7 +129,7 @@ asm volatile( | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -142,8 +142,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -151,13 +151,13 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -176,8 +176,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -185,8 +185,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| @@ -200,8 +200,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -209,8 +209,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| @@ -219,7 +219,7 @@ asm volatile( | |||
| idx += inc_xv*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -232,8 +232,8 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| @@ -241,13 +241,13 @@ asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(v0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| @@ -89,7 +89,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -97,7 +97,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -121,7 +121,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| inc_xv += inc_xv * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -129,7 +129,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT vse_v_f32m8 | |||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| @@ -1,5 +1,5 @@ | |||
| #include "common.h" | |||
| #include <riscv-vector.h> | |||
| #include <riscv_vector.h> | |||
| #define KERNEL8x4_I \ | |||
| "addi t1, %[PB], 1*8 \n\t"\ | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -43,9 +44,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -81,7 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -92,12 +94,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -106,7 +108,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -117,12 +120,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -131,7 +135,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -142,13 +147,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -157,7 +163,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -168,7 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -44,9 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -80,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = v_res[0]; | |||
| temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||
| if(j < m){ | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| @@ -88,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += v_res[0]; | |||
| temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||
| } | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| @@ -96,9 +98,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| }else{ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i = 0; i < n; i++){ | |||
| gvl = VSETVL(m); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| j = 0; | |||
| ix = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -110,7 +113,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = v_res[0]; | |||
| temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||
| if(j < m){ | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| @@ -118,7 +121,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += v_res[0]; | |||
| temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||
| } | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| @@ -117,10 +117,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -130,7 +130,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -138,7 +138,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -165,10 +165,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -178,7 +178,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -186,7 +186,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| } | |||
| } | |||
| } | |||
| @@ -118,10 +118,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -131,7 +131,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| @@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -166,10 +166,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -179,7 +179,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| @@ -187,7 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| } | |||
| } | |||
| } | |||
| @@ -111,17 +111,17 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -129,7 +129,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -153,17 +153,17 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -171,7 +171,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| } | |||
| } | |||
| } | |||
| @@ -129,24 +129,24 @@ asm volatile( | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -190,24 +190,24 @@ asm volatile( | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| } | |||
| } | |||
| } | |||
| @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| @@ -46,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VSEVU_UINT vse64_v_u64m8 | |||
| #define UINT_T long unsigned int | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| @@ -59,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| @@ -71,6 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define UINT_T unsigned int | |||
| #define VSEVU_UINT vse32_v_u32m8 | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| @@ -98,6 +104,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| gvl = VSETVL(n); | |||
| UINT_T temp_uint[gvl]; | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| @@ -183,10 +190,12 @@ asm volatile( | |||
| } | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask0,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| VSEVU_UINT(temp_uint,v_max_index,gvl); | |||
| max_index = temp_uint[max_index]; | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -239,7 +248,7 @@ asm volatile( | |||
| */ | |||
| v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| FLOAT cur_maxf = VFMVFS_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -247,7 +256,9 @@ asm volatile( | |||
| mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask0,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| VSEVU_UINT(temp_uint,v_max_index,gvl); | |||
| max_index = temp_uint[max_index]; | |||
| } | |||
| } | |||
| return(max_index+1); | |||
| @@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| @@ -47,6 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VSEVU_UINT vse64_v_u64m8 | |||
| #define UINT_T long unsigned int | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| @@ -60,6 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| @@ -72,6 +76,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define UINT_T unsigned int | |||
| #define VSEVU_UINT vse32_v_u32m8 | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| @@ -98,6 +104,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| gvl = VSETVL(n); | |||
| UINT_T temp_uint[gvl]; | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| @@ -182,10 +189,11 @@ asm volatile( | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = VFMVFS_FLOAT(v_res); | |||
| mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask0,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| VSEVU_UINT(temp_uint,v_min_index,gvl); | |||
| min_index = temp_uint[min_index]; | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -238,7 +246,7 @@ asm volatile( | |||
| */ | |||
| v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| FLOAT cur_minf = VFMVFS_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| @@ -246,7 +254,9 @@ asm volatile( | |||
| mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask0,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| VSEVU_UINT(temp_uint,v_min_index,gvl); | |||
| min_index = temp_uint[min_index]; | |||
| } | |||
| } | |||
| return(min_index+1); | |||
| @@ -77,14 +77,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -103,14 +103,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_xv * 2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -77,14 +77,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -103,14 +103,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_xv * 2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -30,10 +30,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -50,10 +52,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -106,13 +110,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -122,7 +126,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| @@ -141,16 +145,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -175,13 +179,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -192,7 +196,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| @@ -211,16 +215,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale to the new maximum held in v_res (the value ssq was just rescaled by), not vr | |||
| scale = vr[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(scale * sqrt(ssq)); | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -44,9 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -83,7 +85,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| len += VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| for(;j < n;){ | |||
| @@ -94,7 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | |||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| len += VFMVFS_FLOAT(v_res); | |||
| j += gvl; | |||
| } | |||
| @@ -113,7 +115,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| len += VFMVFS_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| for(;j < n;){ | |||
| @@ -124,7 +126,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | |||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| len += VFMVFS_FLOAT(v_res); | |||
| j += gvl; | |||
| } | |||
| @@ -85,19 +85,24 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| }else{ | |||
| if(da == 0.0){ | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| if(gvl <= n / 2){ | |||
| long int inc_xv = gvl * inc_x; | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | |||
| VSEV_FLOAT(&x[j], v0, gvl); | |||
| VSEV_FLOAT(&x[j+gvl], v0, gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); | |||
| ix += inc_xv * 2; | |||
| } | |||
| } | |||
| //tail | |||
| for(; j <n; ){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| VSEV_FLOAT(&x[j], v0, gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| j += gvl; | |||
| ix += inc_x * gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -1,5 +1,5 @@ | |||
| #include "common.h" | |||
| #include <riscv-vector.h> | |||
| #include <riscv_vector.h> | |||
| #define KERNEL16x4_I \ | |||
| "addi t1, %[PB], 1*4 \n\t"\ | |||
| @@ -31,11 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| @@ -46,11 +47,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| @@ -98,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -109,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -143,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -154,7 +156,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -189,7 +191,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -200,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -240,7 +242,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -251,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -31,11 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| @@ -47,11 +48,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| @@ -100,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -111,7 +113,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -144,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -155,7 +157,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -189,7 +191,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -200,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -239,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -250,7 +252,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| @@ -42,11 +43,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| @@ -56,6 +59,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -91,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| if(j<n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -103,8 +107,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| if(VFMVFS_FLOAT(v_res)> maxf) | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| } | |||
| return(maxf); | |||
| } | |||
| @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| @@ -48,6 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() /* e64 to match the vfloat64m1_t type used in this branch */ | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| @@ -92,7 +94,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| minf = VFMVFS_FLOAT(v_res); | |||
| if(j<n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -104,8 +106,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| if(VFMVFS_FLOAT(v_res) < minf) | |||
| minf = VFMVFS_FLOAT(v_res); | |||
| } | |||
| return(minf); | |||
| } | |||
| @@ -33,9 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| @@ -47,9 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| @@ -90,7 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += VFFMVFS_FLOAT(v_res); | |||
| } | |||
| for(;j<n2;){ | |||
| gvl = VSETVL(n2-j); | |||
| @@ -98,7 +100,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += VFFMVFS_FLOAT(v_res); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -123,7 +125,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += VFFMVFS_FLOAT(v_res); | |||
| if(j<n){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| @@ -135,7 +137,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| asumf += VFFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(asumf); | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -46,9 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -108,9 +110,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| dot[0] += v_res[0]; | |||
| dot[0] += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| dot[1] += v_res[0]; | |||
| dot[1] += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| @@ -131,9 +133,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); | |||
| #endif | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| dot[0] += v_res[0]; | |||
| dot[0] += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| dot[1] += v_res[0]; | |||
| dot[1] += VFMVFS_FLOAT(v_res); | |||
| } | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| @@ -31,8 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| @@ -43,8 +44,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| @@ -100,9 +102,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp_r = v_res[0]; | |||
| temp_r = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||
| temp_i = v_res[0]; | |||
| temp_i = VFMVFS_FLOAT(v_res); | |||
| if(j/2 < m){ | |||
| gvl = VSETVL(m-j/2); | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| @@ -122,9 +124,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| #endif | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp_r += v_res[0]; | |||
| temp_r += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||
| temp_i += v_res[0]; | |||
| temp_i += VFMVFS_FLOAT(v_res); | |||
| } | |||
| #if !defined(XCONJ) | |||
| y[iy] += alpha_r * temp_r - alpha_i * temp_i; | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| @@ -46,9 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| @@ -142,9 +144,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| ia += inc_av; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 = v_res[0]; | |||
| temp_r2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 = v_res[0]; | |||
| temp_i2 = VFMVFS_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | |||
| @@ -180,9 +182,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| #endif | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 += v_res[0]; | |||
| temp_r2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 += v_res[0]; | |||
| temp_i2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| @@ -46,9 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| @@ -141,9 +143,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| ia += inc_av; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 = v_res[0]; | |||
| temp_r2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 = v_res[0]; | |||
| temp_i2 = VFMVFS_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | |||
| @@ -179,9 +181,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| #endif | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 += v_res[0]; | |||
| temp_r2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 += v_res[0]; | |||
| temp_i2 += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += temp_r1 * a_ptr[ja]; | |||
| @@ -31,9 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| @@ -51,9 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| @@ -107,13 +109,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -123,7 +125,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n2){ | |||
| @@ -142,16 +144,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -176,13 +178,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -204,13 +206,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -221,7 +223,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| @@ -242,9 +244,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| } | |||
| @@ -265,20 +267,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = v_res[0]; | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += v_res[0]; | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(scale * sqrt(ssq)); | |||