/*
 * AUTOGENERATED KERNEL
 * Script: ./kernel/riscv64/generate_kernel.py
 * Settings:
 *   LMUL=2
 *   M=8
 *   M_tail_scalar_from=2
 *   N=4
 *   __riscv_='__riscv_'
 *   complex=True
 *   conjugate=False
 *   cpu='zvl128b'
 *   force_acc_double=False
 *   index_type='BLASLONG'
 *   op='trmm'
 *   param_precision='float'
 *   reg_width_bits=128
 *   tail_policy=''
 *   trace=False
 * Derived:
 *   ELEN_ACC=32
 *   ELEN_PARAM=32
 *   LMUL_ACC=2
 *   VFMACC='__riscv_vfmacc_vf_f32m2'
 *   VFMUL='__riscv_vfmul_vf_f32m2'
 *   VLEV='__riscv_vle32_v_f32m2'
 *   VLSEV='__riscv_vlse32_v_f32m2'
 *   VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
 *   VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
 *   VSETVL='__riscv_vsetvl_e32m2'
 *   VSEV='__riscv_vse32_v_f32m2'
 *   VSSEV='__riscv_vsse32_v_f32m2'
 *   acc_vector_t='vfloat32m2_t'
 *   output='ctrmm_kernel_8x4_zvl128b.c'
 *   param_scalar_t='float'
 *   param_vector_t='vfloat32m2_t'
 */

#include "common.h"

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define S0 1
#define S1 -1
#define S2 1
#define S3 1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfmacc
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define S0 1
#define S1 1
#define S2 1
#define S3 -1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfmsac
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define S0 1
#define S1 1
#define S2 -1
#define S3 1
#define VFMACC_RR __riscv_vfmacc
#define VFMACC_RI __riscv_vfnmsac
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define S0 1
#define S1 -1
#define S2 -1
#define S3 -1
#define VFMACC_RR __riscv_vfmsac
#define VFMACC_RI __riscv_vfnmacc
#endif

#if defined(LEFT) != defined(TRANSA)
#define BACKWARDS
#endif

int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
{
    BLASLONG gvl = 0;
    BLASLONG m_top = 0;
    BLASLONG n_top = 0;

    // -- MAIN PASS

    for (BLASLONG j = 0; j < N / 4; j += 1) {
        m_top = 0;
        gvl = __riscv_vsetvl_e32m2(8);

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
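            /*
             * TRMM offset bookkeeping: `off` locates this block relative to
             * the diagonal of the triangular operand. With BACKWARDS defined
             * (LEFT != TRANSA) the first `off` k-iterations contribute
             * nothing, so `ai`/`bi` are advanced past them and `pass_K` is
             * shortened; otherwise the loop is truncated to the leading
             * `off + <block size>` iterations. The intent appears to be that
             * only the k-range intersecting the triangle enters the product.
             */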
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 8 * 2;
            bi += off * 4 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 8;
#else
            pass_K = off + 4;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            float B2r = B[bi + 2 * 2 + 0];
            float B2i = B[bi + 2 * 2 + 1];
            float B3r = B[bi + 3 * 2 + 0];
            float B3i = B[bi + 3 * 2 + 1];
            bi += 4 * 2;
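            /*
             * A is packed as interleaved complex values (re, im, re, im, ...),
             * so two strided loads split one column slice into separate real
             * and imaginary vectors; the complex products below are then
             * formed with scalar-times-vector FMAs against the B values.
             */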
            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
            // leaving 6 vector registers for temporaries
            // performing 2 operations between reuses of temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;
            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
            vfloat32m2_t ACC2r = tmp0r;
            vfloat32m2_t ACC2i = tmp0i;
            vfloat32m2_t ACC3r = tmp1r;
            vfloat32m2_t ACC3i = tmp1i;
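            /*
             * The first k-iteration is peeled above, seeding the accumulators
             * directly from the vfmul/VFMACC results (presumably cheaper than
             * zeroing eight accumulator groups and doing a full add on the
             * first trip). VFMACC_RR/VFMACC_RI carry the sign pattern for the
             * conjugation variant selected by the macros at the top of the
             * file.
             */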
            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                B2r = B[bi + 2 * 2 + 0];
                B2i = B[bi + 2 * 2 + 1];
                B3r = B[bi + 3 * 2 + 0];
                B3i = B[bi + 3 * 2 + 1];
                bi += 4 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
            }
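            /*
             * Apply the complex alpha scaling
             *   Cr = alphar*ACCr - alphai*ACCi
             *   Ci = alphar*ACCi + alphai*ACCr
             * and re-interleave real/imaginary parts with strided stores.
             * TRMM overwrites C; there is no beta accumulation here.
             */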
            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);

            m_top += 8;
        }
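        /*
         * M-direction tails. Matching the generator settings
         * (M=8, M_tail_scalar_from=2): a remainder of 4 rows reuses the
         * vector kernel at VL=4, while remainders of 2 and 1 fall back to
         * the scalar loops further below.
         */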
        // -- tails for main pass

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 4 * 2;
            bi += off * 4 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 4;
#else
            pass_K = off + 4;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            float B2r = B[bi + 2 * 2 + 0];
            float B2i = B[bi + 2 * 2 + 1];
            float B3r = B[bi + 3 * 2 + 0];
            float B3i = B[bi + 3 * 2 + 1];
            bi += 4 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
            // leaving 6 vector registers for temporaries
            // performing 2 operations between reuses of temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;
            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
            vfloat32m2_t ACC2r = tmp0r;
            vfloat32m2_t ACC2i = tmp0i;
            vfloat32m2_t ACC3r = tmp1r;
            vfloat32m2_t ACC3i = tmp1i;

            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                B2r = B[bi + 2 * 2 + 0];
                B2i = B[bi + 2 * 2 + 1];
                B3r = B[bi + 3 * 2 + 0];
                B3i = B[bi + 3 * 2 + 1];
                bi += 4 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);

            m_top += 4;
        }
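        /*
         * Scalar tail: each even/odd result pair accumulates the real and
         * imaginary part of one C element. The S0..S3 sign constants play
         * the same role here that VFMACC_RR/VFMACC_RI play in the vector
         * path.
         */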
        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            float result8 = 0;
            float result9 = 0;
            float result10 = 0;
            float result11 = 0;
            float result12 = 0;
            float result13 = 0;
            float result14 = 0;
            float result15 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 2 * 2;
            bi += off * 4 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 2;
#else
            pass_K = off + 4;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
                ai += 2 * 2;
                bi += 4 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = result2 * alphar;
            Ci = result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
            Cr = result4 * alphar;
            Ci = result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = result6 * alphar;
            Ci = result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
            Cr = result8 * alphar;
            Ci = result9 * alphar;
            Cr -= result9 * alphai;
            Ci += result8 * alphai;
            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
            Cr = result10 * alphar;
            Ci = result11 * alphar;
            Cr -= result11 * alphai;
            Ci += result10 * alphai;
            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
            Cr = result12 * alphar;
            Ci = result13 * alphar;
            Cr -= result13 * alphai;
            Ci += result12 * alphai;
            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
            Cr = result14 * alphar;
            Ci = result15 * alphar;
            Cr -= result15 * alphai;
            Ci += result14 * alphai;
            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;

            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 1 * 2;
            bi += off * 4 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 4;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
                ai += 1 * 2;
                bi += 4 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = result2 * alphar;
            Ci = result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = result4 * alphar;
            Ci = result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
            Cr = result6 * alphar;
            Ci = result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;

            m_top += 1;
        }

        n_top += 4;
    }
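    /*
     * N-direction tails: the same structure is repeated for a remainder of
     * two B columns and then one, with the per-column B loads and the
     * off-based index arithmetic scaled down accordingly.
     */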
    // -- tails for N=2

    if (N & 2) {
        gvl = __riscv_vsetvl_e32m2(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 8 * 2;
            bi += off * 2 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 8;
#else
            pass_K = off + 2;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            bi += 2 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 10 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;

            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                bi += 2 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 4 * 2;
            bi += off * 2 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 4;
#else
            pass_K = off + 2;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            float B1r = B[bi + 1 * 2 + 0];
            float B1i = B[bi + 1 * 2 + 1];
            bi += 2 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
            // leaving 10 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;
            vfloat32m2_t ACC1r = tmp1r;
            vfloat32m2_t ACC1i = tmp1i;

            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                B1r = B[bi + 1 * 2 + 0];
                B1i = B[bi + 1 * 2 + 1];
                bi += 2 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
            ci += ldc;
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);

            m_top += 4;
        }

        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            float result4 = 0;
            float result5 = 0;
            float result6 = 0;
            float result7 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 2 * 2;
            bi += off * 2 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 2;
#else
            pass_K = off + 2;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
                ai += 2 * 2;
                bi += 2 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = result2 * alphar;
            Ci = result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
            Cr = result4 * alphar;
            Ci = result5 * alphar;
            Cr -= result5 * alphai;
            Ci += result4 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
            Cr = result6 * alphar;
            Ci = result7 * alphar;
            Cr -= result7 * alphai;
            Ci += result6 * alphai;
            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;

            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 1 * 2;
            bi += off * 2 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 2;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
                ai += 1 * 2;
                bi += 2 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = result2 * alphar;
            Ci = result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;

            m_top += 1;
        }

        n_top += 2;
    }
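    /*
     * Note on vector length: zvl128b guarantees VLEN >= 128, so VLMAX for
     * e32/m2 is at least 8 and the __riscv_vsetvl_e32m2(8) / (4) calls in
     * this file always return the full element count requested; the vector
     * paths never execute with a partial vector length.
     */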
    // -- tails for N=1

    if (N & 1) {
        gvl = __riscv_vsetvl_e32m2(8);
        m_top = 0;

        for (BLASLONG i = 0; i < M / 8; i += 1) {
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 8 * 2;
            bi += off * 1 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 8;
#else
            pass_K = off + 1;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            bi += 1 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 8 * 2;

            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 12 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;

            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                bi += 1 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 8 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);

            m_top += 8;
        }

        if (M & 4) {
            gvl = __riscv_vsetvl_e32m2(4);
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 4 * 2;
            bi += off * 1 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 4;
#else
            pass_K = off + 1;
#endif
#endif
            float B0r = B[bi + 0 * 2 + 0];
            float B0i = B[bi + 0 * 2 + 1];
            bi += 1 * 2;

            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
            ai += 4 * 2;

            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
            // leaving 12 vector registers for temporaries
            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
            vfloat32m2_t ACC0r = tmp0r;
            vfloat32m2_t ACC0i = tmp0i;

            for (BLASLONG k = 1; k < pass_K; k++) {
                B0r = B[bi + 0 * 2 + 0];
                B0i = B[bi + 0 * 2 + 1];
                bi += 1 * 2;
                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
                ai += 4 * 2;
                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
            }

            BLASLONG ci = n_top * ldc + m_top;

            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);

            m_top += 4;
        }
        if (M & 2) {
            float result0 = 0;
            float result1 = 0;
            float result2 = 0;
            float result3 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 2 * 2;
            bi += off * 1 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 2;
#else
            pass_K = off + 1;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
                ai += 2 * 2;
                bi += 1 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
            Cr = result2 * alphar;
            Ci = result3 * alphar;
            Cr -= result3 * alphai;
            Ci += result2 * alphai;
            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;

            m_top += 2;
        }

        if (M & 1) {
            float result0 = 0;
            float result1 = 0;
            BLASLONG ai = m_top * K * 2;
            BLASLONG bi = n_top * K * 2;
            BLASLONG pass_K = K;
#ifdef LEFT
            BLASLONG off = offset + m_top;
#else
            BLASLONG off = -offset + n_top;
#endif
#ifdef BACKWARDS
            ai += off * 1 * 2;
            bi += off * 1 * 2;
            pass_K -= off;
#else
#ifdef LEFT
            pass_K = off + 1;
#else
            pass_K = off + 1;
#endif
#endif
            for (BLASLONG k = 0; k < pass_K; k++) {
                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
                ai += 1 * 2;
                bi += 1 * 2;
            }

            BLASLONG ci = n_top * ldc + m_top;
            float Cr, Ci;
            Cr = result0 * alphar;
            Ci = result1 * alphar;
            Cr -= result1 * alphai;
            Ci += result0 * alphai;
            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;

            m_top += 1;
        }

        n_top += 1;
    }

    return 0;
}