/*
 * AUTOGENERATED KERNEL
 * Script: ./kernel/riscv64/generate_kernel.py
 * Settings:
 *   LMUL=4 M=8 M_tail_scalar_from=2 N=4 __riscv_='__riscv_' complex=False
 *   conjugate=False cpu='zvl128b' force_acc_double=False index_type='BLASLONG'
 *   op='gemm' param_precision='double' reg_width_bits=128 tail_policy=''
 *   trace=False
 * Derived:
 *   ELEN_ACC=64 ELEN_PARAM=64 LMUL_ACC=4
 *   VFMACC='__riscv_vfmacc_vf_f64m4' VFMUL='__riscv_vfmul_vf_f64m4'
 *   VLEV='__riscv_vle64_v_f64m4' VLSEV='__riscv_vlse64_v_f64m4'
 *   VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
 *   VSETVL='__riscv_vsetvl_e64m4' VSEV='__riscv_vse64_v_f64m4'
 *   VSSEV='__riscv_vsse64_v_f64m4'
 *   acc_vector_t='vfloat64m4_t' output='dgemm_kernel_8x4_zvl128b.c'
 *   param_scalar_t='double' param_vector_t='vfloat64m4_t'
 */

#include "common.h"

/*
 * DGEMM micro-kernel with 8x4 register blocking for RVV with VLEN >= 128
 * (zvl128b). Computes C += alpha * A * B on pre-packed panels:
 *   A: packed in row panels of width 8/4/2/1; each k step advances by the
 *      panel width (see the ai increments below),
 *   B: packed in column panels of width 4/2/1; each k step advances by the
 *      panel width (see the bi increments below),
 *   C: column-major with leading dimension ldc.
 * With VLEN = 128 and LMUL = 4, one e64m4 vector holds 8 doubles, so an
 * 8-row slice of A fits in a single vector register group.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
{
    BLASLONG gvl = 0;
    BLASLONG m_top = 0;
    BLASLONG n_top = 0;

    // -- MAIN PASS: full 8x4 blocks
    for (BLASLONG j = 0; j < N / 4; j += 1) {
        m_top = 0;
        gvl = __riscv_vsetvl_e64m4(8);

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            // k = 0: plain multiply initializes the accumulators
            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            double B2 = B[bi + 2];
            double B3 = B[bi + 3];
            bi += 4;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);

            // k = 1..K-1: one packed column of A (8 rows) and one packed
            // row of B (4 columns) per step, fused multiply-accumulate
            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                B2 = B[bi + 2];
                B3 = B[bi + 3];
                bi += 4;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
            }

            // C update: load four columns of C, add alpha * accumulator,
            // store back (columns are ldc apart)
            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);

            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);

            m_top += 8;
        }

        // -- tails for main pass (M not a multiple of 8): 4x4 vector tail
        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            double B2 = B[bi + 2];
            double B3 = B[bi + 3];
            bi += 4;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                B2 = B[bi + 2];
                B3 = B[bi + 3];
                bi += 4;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);

            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);

            m_top += 4;
        }

        // 2x4 and 1x4 tails are computed in scalar code
        // (M_tail_scalar_from=2)
        if (M & 2) {
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            double result4 = 0;
            double result5 = 0;
            double result6 = 0;
            double result7 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                result2 += A[ai + 0] * B[bi + 1];
                result3 += A[ai + 1] * B[bi + 1];
                result4 += A[ai + 0] * B[bi + 2];
                result5 += A[ai + 1] * B[bi + 2];
                result6 += A[ai + 0] * B[bi + 3];
                result7 += A[ai + 1] * B[bi + 3];
                ai += 2;
                bi += 4;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
            C[ci + 1 * ldc + 0] += alpha * result2;
            C[ci + 1 * ldc + 1] += alpha * result3;
            C[ci + 2 * ldc + 0] += alpha * result4;
            C[ci + 2 * ldc + 1] += alpha * result5;
            C[ci + 3 * ldc + 0] += alpha * result6;
            C[ci + 3 * ldc + 1] += alpha * result7;

            m_top += 2;
        }

        if (M & 1) {
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 0] * B[bi + 1];
                result2 += A[ai + 0] * B[bi + 2];
                result3 += A[ai + 0] * B[bi + 3];
                ai += 1;
                bi += 4;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 1 * ldc + 0] += alpha * result1;
            C[ci + 2 * ldc + 0] += alpha * result2;
            C[ci + 3 * ldc + 0] += alpha * result3;

            m_top += 1;
        }

        n_top += 4;
    }

    // -- tails for N=2: 8x2, 4x2, 2x2, 1x2 blocks
    if (N & 2) {
        gvl = __riscv_vsetvl_e64m4(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            bi += 2;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                bi += 2;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);

            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            double B0 = B[bi + 0];
            double B1 = B[bi + 1];
            bi += 2;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                B1 = B[bi + 1];
                bi += 2;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            ci += ldc;
            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);

            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
            ci += ldc;
            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);

            m_top += 4;
        }

        if (M & 2) {
            double result0 = 0;
            double result1 = 0;
            double result2 = 0;
            double result3 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                result2 += A[ai + 0] * B[bi + 1];
                result3 += A[ai + 1] * B[bi + 1];
                ai += 2;
                bi += 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
            C[ci + 1 * ldc + 0] += alpha * result2;
            C[ci + 1 * ldc + 1] += alpha * result3;

            m_top += 2;
        }

        if (M & 1) {
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 0] * B[bi + 1];
                ai += 1;
                bi += 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 1 * ldc + 0] += alpha * result1;

            m_top += 1;
        }

        n_top += 2;
    }

    // -- tails for N=1: 8x1, 4x1, 2x1, 1x1 blocks
    if (N & 1) {
        gvl = __riscv_vsetvl_e64m4(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            double B0 = B[bi + 0];
            bi += 1;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 8;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                bi += 1;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 8;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;
            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e64m4(4);

            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            double B0 = B[bi + 0];
            bi += 1;

            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
            ai += 4;

            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);

            for (BLASLONG k = 1; k < K; k++) {
                B0 = B[bi + 0];
                bi += 1;

                A0 = __riscv_vle64_v_f64m4(&A[ai], gvl);
                ai += 4;

                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;
            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);

            ci = n_top * ldc + m_top;
            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);

            m_top += 4;
        }

        if (M & 2) {
            double result0 = 0;
            double result1 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                result1 += A[ai + 1] * B[bi + 0];
                ai += 2;
                bi += 1;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;

            m_top += 2;
        }

        if (M & 1) {
            double result0 = 0;
            BLASLONG ai = m_top * K;
            BLASLONG bi = n_top * K;

            for (BLASLONG k = 0; k < K; k++) {
                result0 += A[ai + 0] * B[bi + 0];
                ai += 1;
                bi += 1;
            }

            BLASLONG ci = n_top * ldc + m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;

            m_top += 1;
        }

        n_top += 1;
    }

    return 0;
}
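
/*
 * Editor's note: the block below is NOT part of the generated kernel and is
 * not OpenBLAS API. It is a minimal self-test sketch showing how a driver
 * might pack A and B into the panel layout this kernel consumes (inferred
 * from the ai += {8,4,2,1} and bi += {4,2,1} strides above; OpenBLAS uses
 * its own ?gemm_oncopy/?gemm_incopy routines for this) and check the result
 * against a naive triple loop. KERNEL_SELFTEST, pack_A, pack_B and the test
 * sizes are illustrative assumptions; CNAME, FLOAT and BLASLONG come from
 * common.h and require the OpenBLAS build environment.
 */
#ifdef KERNEL_SELFTEST
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Pack column-major A (lda = M) into row panels of width 8, then 4, 2, 1,
   so that each k step inside a panel advances by the panel width. */
static FLOAT *pack_A(BLASLONG M, BLASLONG K, const FLOAT *a, BLASLONG lda)
{
   FLOAT *out = malloc(sizeof(FLOAT) * M * K), *p = out;
   BLASLONG row = 0;
   for (BLASLONG i = 0; i < M / 8; i++, row += 8)
      for (BLASLONG k = 0; k < K; k++)
         for (BLASLONG r = 0; r < 8; r++)
            *p++ = a[(row + r) + k * lda];
   for (BLASLONG w = 4; w >= 1; w /= 2)
      if (M & w) {
         for (BLASLONG k = 0; k < K; k++)
            for (BLASLONG r = 0; r < w; r++)
               *p++ = a[(row + r) + k * lda];
         row += w;
      }
   return out;
}

/* Pack column-major B (ldb = K) into column panels of width 4, then 2, 1. */
static FLOAT *pack_B(BLASLONG N, BLASLONG K, const FLOAT *b, BLASLONG ldb)
{
   FLOAT *out = malloc(sizeof(FLOAT) * N * K), *p = out;
   BLASLONG col = 0;
   for (BLASLONG j = 0; j < N / 4; j++, col += 4)
      for (BLASLONG k = 0; k < K; k++)
         for (BLASLONG c = 0; c < 4; c++)
            *p++ = b[k + (col + c) * ldb];
   for (BLASLONG w = 2; w >= 1; w /= 2)
      if (N & w) {
         for (BLASLONG k = 0; k < K; k++)
            for (BLASLONG c = 0; c < w; c++)
               *p++ = b[k + (col + c) * ldb];
         col += w;
      }
   return out;
}

int main(void)
{
   /* Odd M and N exercise every vector block and scalar tail above. */
   BLASLONG M = 13, N = 7, K = 5, ldc = M;
   FLOAT alpha = 0.5;
   FLOAT *a = malloc(sizeof(FLOAT) * M * K);
   FLOAT *b = malloc(sizeof(FLOAT) * K * N);
   FLOAT *c = calloc(M * N, sizeof(FLOAT));
   FLOAT *c_ref = calloc(M * N, sizeof(FLOAT));
   for (BLASLONG i = 0; i < M * K; i++) a[i] = (FLOAT)(i % 9) - 4;
   for (BLASLONG i = 0; i < K * N; i++) b[i] = (FLOAT)(i % 7) - 3;

   FLOAT *pa = pack_A(M, K, a, M);
   FLOAT *pb = pack_B(N, K, b, K);
   CNAME(M, N, K, alpha, pa, pb, c, ldc);

   /* Naive reference: C_ref += alpha * A * B. */
   for (BLASLONG j = 0; j < N; j++)
      for (BLASLONG i = 0; i < M; i++) {
         double s = 0;
         for (BLASLONG k = 0; k < K; k++)
            s += a[i + k * M] * b[k + j * K];
         c_ref[i + j * ldc] += alpha * s;
      }

   double max_err = 0;
   for (BLASLONG i = 0; i < M * N; i++) {
      double e = fabs((double)(c[i] - c_ref[i]));
      if (e > max_err) max_err = e;
   }
   printf("max abs error: %g\n", max_err);
   free(a); free(b); free(c); free(c_ref); free(pa); free(pb);
   return max_err < 1e-12 ? 0 : 1;
}
#endif /* KERNEL_SELFTEST */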