@@ -27,110 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e32m1)
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
#ifdef RISCV_0p10_INTRINSICS
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m8_f32m1(v_res, va, vb, gvl)
#else
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1)
#endif
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m8)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
#define xint_t int
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e64m1)
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
#ifdef RISCV_0p10_INTRINSICS
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m8_f64m1(v_res, va, vb, gvl)
#else
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1)
#endif
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m8)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
#define xint_t long long
#endif
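
// Computes y[i*inc_y] += alpha * dot(a[:,i], x) for each of the n columns of a
// (the transposed-GEMV kernel).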
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i = 0, j = 0, k = 0;
    BLASLONG ix = 0, iy = 0;
    FLOAT *a_ptr = a;
    FLOAT temp;

    FLOAT_V_T va, vr, vx;
    unsigned int gvl = 0;
    FLOAT_V_T_M1 v_res;
    size_t vlmax = VSETVL_MAX_M1();
#ifndef RISCV_0p10_INTRINSICS
    FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
    FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
    FLOAT *a_ptrs[4], *y_ptrs[4];
#endif

    if(inc_x == 1){
#ifndef RISCV_0p10_INTRINSICS
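        // Unrolled path: process four columns of A per outer iteration so each
        // vector load of x is shared by four partial dot products.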
        BLASLONG anr = n - n % 4;
        for (; i < anr; i += 4) {
            gvl = VSETVL(m);
            j = 0;
            for (int l = 0; l < 4; l++) {
                a_ptrs[l] = a + (i + l) * lda;
                y_ptrs[l] = y + (i + l) * inc_y;
            }
            vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
            vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
            vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
            vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
            vr0 = VFMVVF_FLOAT(0.0, gvl);
            vr1 = VFMVVF_FLOAT(0.0, gvl);
            vr2 = VFMVVF_FLOAT(0.0, gvl);
            vr3 = VFMVVF_FLOAT(0.0, gvl);
            for (k = 0; k < m / gvl; k++) {
                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
                vx = VLEV_FLOAT(x + j, gvl);
                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
                // Floating-point addition is not associative ((a + b) + c != a + (b + c)),
                // so multiply and reduce inside the loop body to keep the same summation
                // order as the reference implementation.
                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
                j += gvl;
            }
            if (j < m) {
                gvl = VSETVL(m - j);
                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
                vx = VLEV_FLOAT(x + j, gvl);
                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
            }
            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
            *y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
            *y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
            *y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
        }
        // handle the remaining n % 4 columns one at a time
        for (; i < n; i++) {
            v_res = VFMVVF_FLOAT_M1(0, vlmax);
            gvl = VSETVL(m);
            j = 0;
            a_ptrs[0] = a + i * lda;
            y_ptrs[0] = y + i * inc_y;
            vr0 = VFMVVF_FLOAT(0, gvl);
            for (k = 0; k < m / gvl; k++) {
                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
                vx = VLEV_FLOAT(x + j, gvl);
                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
                j += gvl;
            }
            if (j < m) {
                gvl = VSETVL(m - j);
                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
                vx = VLEV_FLOAT(x + j, gvl);
                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
            }
            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
        }
#else
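        // RVV 0.10 fallback: the 0p10 VFREDSUM_FLOAT macro hard-codes v_res as its
        // destination operand, so the four-accumulator unrolling above cannot be
        // expressed with it; keep the original one-column loop instead.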
        for(i = 0; i < n; i++){
            v_res = VFMVVF_FLOAT_M1(0, 1);
            gvl = VSETVL(m);
            j = 0;
            vr = VFMVVF_FLOAT(0, gvl);
            for(k = 0; k < m/gvl; k++){
                va = VLEV_FLOAT(&a_ptr[j], gvl);
                vx = VLEV_FLOAT(&x[j], gvl);
                vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside the loop (see sketch below),
                v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from the scalar path to make tests fail
                j += gvl;
            }
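            /* A minimal sketch of the vfmacc variant mentioned above, assuming a
             * hypothetical VFMACCVV_FLOAT macro mapped to RISCV_RVV(vfmacc_vv_f32m8)
             * (f64m8 when DOUBLE is defined); it accumulates per lane and reduces
             * once after the loop:
             *
             *     vr = VFMVVF_FLOAT(0, gvl);
             *     for(k = 0; k < m/gvl; k++){
             *         va = VLEV_FLOAT(&a_ptr[j], gvl);
             *         vx = VLEV_FLOAT(&x[j], gvl);
             *         vr = VFMACCVV_FLOAT(vr, va, vx, gvl);  // vr += va * vx, per lane
             *         j += gvl;
             *     }
             *     v_res = VFREDSUM_FLOAT(vr, v_res, gvl);    // single reduction
             *
             * Left disabled because the changed summation order fails the comparison
             * against the scalar reference. */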
            if(j < m){
                gvl = VSETVL(m-j);
                va = VLEV_FLOAT(&a_ptr[j], gvl);
                vx = VLEV_FLOAT(&x[j], gvl);
                vr = VFMULVV_FLOAT(va, vx, gvl);
                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
            }
            temp = (FLOAT)EXTRACT_FLOAT(v_res);
            y[iy] += alpha * temp;
            iy += inc_y;
            a_ptr += lda;
        }
#endif
    } else {
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
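        // inc_x != 1: gather x with strided loads (vlse); stride_x is the byte stride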
        for(i = 0; i < n; i++){
            v_res = VFMVVF_FLOAT_M1(0, 1);
            gvl = VSETVL(m);
            j = 0;
            ix = 0;
            vr = VFMVVF_FLOAT(0, gvl);
            for(k = 0; k < m/gvl; k++){
                va = VLEV_FLOAT(&a_ptr[j], gvl);
                vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                vr = VFMULVV_FLOAT(va, vx, gvl);
                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                j += gvl;
                ix += inc_x * gvl;
            }
            if(j < m){
                gvl = VSETVL(m-j);
                va = VLEV_FLOAT(&a_ptr[j], gvl);
                vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                vr = VFMULVV_FLOAT(va, vx, gvl);
                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
            }
            temp = (FLOAT)EXTRACT_FLOAT(v_res);
            y[iy] += alpha * temp;
            iy += inc_y;
            a_ptr += lda;
        }
    }
    return (0);
}