During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed.tags/v0.3.27
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLEV_FLOAT(x_ptr, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLSEV_FLOAT(x_ptr, stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); | |||
| @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT(vr, vx, va, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 | |||
| #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu | |||
| #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 | |||
| @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 | |||
| #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu | |||
| #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 | |||
| @@ -90,15 +90,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); | |||
| vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); | |||
| vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); | |||
| #else | |||
| vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); | |||
| vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); | |||
| vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); | |||
| #endif | |||
| j += vl * 2; | |||
| ix += vl * inc_x * 2; | |||
| @@ -134,15 +134,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); | |||
| vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); | |||
| vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); | |||
| #else | |||
| vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); | |||
| vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); | |||
| vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); | |||
| vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); | |||
| vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); | |||
| #endif | |||
| j += vl * 2; | |||
| ix += vl * inc_x * 2; | |||