| @@ -63,6 +63,10 @@ ifeq ($(TARGET), RISCV64_ZVL256B) | |||||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | ||||
| endif | endif | ||||
# RISCV64_ZVL128B target: the RISCV64_GENERIC -march string (rv64imafdc)
# with the "v" vector extension appended, using the same lp64d ABI.
| ifeq ($(TARGET), RISCV64_ZVL128B) | |||||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||||
| endif | |||||
| ifeq ($(TARGET), RISCV64_GENERIC) | ifeq ($(TARGET), RISCV64_GENERIC) | ||||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | ||||
| endif | endif | ||||
| @@ -10,6 +10,10 @@ ifeq ($(CORE), RISCV64_ZVL256B) | |||||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d | CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d | ||||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | ||||
| endif | endif | ||||
# RISCV64_ZVL128B core: C and Fortran are both compiled for rv64imafdcv / lp64d.
# As for the neighbouring cores, only the Fortran flags add -static.
# NOTE(review): the ZVL256B C flags carry an explicit _zvl256b suffix, while no
# _zvl128b suffix is passed here (presumably "v" already implies it) — confirm.
| ifeq ($(CORE), RISCV64_ZVL128B) | |||||
| CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d | |||||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||||
| endif | |||||
| ifeq ($(CORE), RISCV64_GENERIC) | ifeq ($(CORE), RISCV64_GENERIC) | ||||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | ||||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | ||||
| @@ -119,6 +119,7 @@ Z14 | |||||
| 10.RISC-V 64: | 10.RISC-V 64: | ||||
| RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54) | RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54) | ||||
| RISCV64_ZVL128B | |||||
| C910V | C910V | ||||
| x280 | x280 | ||||
| RISCV64_ZVL256B | RISCV64_ZVL256B | ||||
| @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define BUFFER_SIZE ( 32 << 20) | #define BUFFER_SIZE ( 32 << 20) | ||||
| #define SEEK_ADDRESS | #define SEEK_ADDRESS | ||||
| #if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) | |||||
| #if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) | |||||
| # include <riscv_vector.h> | # include <riscv_vector.h> | ||||
| #endif | #endif | ||||
| @@ -74,12 +74,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CPU_C910V 1 | #define CPU_C910V 1 | ||||
| #define CPU_x280 2 | #define CPU_x280 2 | ||||
| #define CPU_RISCV64_ZVL256B 3 | #define CPU_RISCV64_ZVL256B 3 | ||||
| #define CPU_RISCV64_ZVL128B 4 | |||||
/*
 * Display names for the supported cores. The entries are ordered to line up
 * with the numeric CPU_* constants (CPU_C910V = 1 ... CPU_RISCV64_ZVL128B = 4).
 * NOTE(review): the two ZVL entries keep the "CPU_" prefix of their constant,
 * whereas "RISCV64_GENERIC", "C910V" and "x280" do not — confirm whether the
 * prefix is intended in the printable name.
 */
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "RISCV64_GENERIC", | "RISCV64_GENERIC", | ||||
| "C910V", | "C910V", | ||||
| "x280", | "x280", | ||||
| "CPU_RISCV64_ZVL256B" | |||||
| "CPU_RISCV64_ZVL256B", | |||||
| "CPU_RISCV64_ZVL128B" | |||||
| }; | }; | ||||
| int detect(void){ | int detect(void){ | ||||
| @@ -1703,9 +1703,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | ||||
| #define LIBNAME "riscv64_zvl256b" | #define LIBNAME "riscv64_zvl256b" | ||||
| #define CORENAME "RISCV64_ZVL256B" | #define CORENAME "RISCV64_ZVL256B" | ||||
| #else | |||||
| #endif | #endif | ||||
/*
 * Forced-target description for RISCV64_ZVL128B, active only when
 * FORCE_RISCV64_ZVL128B is defined. It sets FORCE and the architecture,
 * sub-architecture, directory, library and core name strings.
 * ARCHCONFIG passes -DRISCV64_ZVL128B plus the cache/TLB parameters:
 * L1 data size 32768 and L2 size 1048576 (presumably bytes), both with a
 * line size of 32, 128 default DTB entries, DTB size 4096 and L2
 * associativity 4.
 */
| #ifdef FORCE_RISCV64_ZVL128B | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "RISCV64" | |||||
| #define SUBARCHITECTURE "RISCV64_ZVL128B" | |||||
| #define SUBDIRNAME "riscv64" | |||||
| #define ARCHCONFIG "-DRISCV64_ZVL128B " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||||
| #define LIBNAME "riscv64_zvl128b" | |||||
| #define CORENAME "RISCV64_ZVL128B" | |||||
| #endif | |||||
| #if defined(FORCE_E2K) || defined(__e2k__) | #if defined(FORCE_E2K) || defined(__e2k__) | ||||
| #define FORCE | #define FORCE | ||||
| @@ -0,0 +1,243 @@ | |||||
# AMAX / AMIN / MAX / MIN kernels and their I-prefixed counterparts.
# Every entry selects a *_rvv.c source; the S and D variables share one file,
# and the C and Z variables share the corresponding z-prefixed file.
| SAMAXKERNEL = amax_rvv.c | |||||
| DAMAXKERNEL = amax_rvv.c | |||||
| CAMAXKERNEL = zamax_rvv.c | |||||
| ZAMAXKERNEL = zamax_rvv.c | |||||
| SAMINKERNEL = amin_rvv.c | |||||
| DAMINKERNEL = amin_rvv.c | |||||
| CAMINKERNEL = zamin_rvv.c | |||||
| ZAMINKERNEL = zamin_rvv.c | |||||
| SMAXKERNEL = max_rvv.c | |||||
| DMAXKERNEL = max_rvv.c | |||||
| SMINKERNEL = min_rvv.c | |||||
| DMINKERNEL = min_rvv.c | |||||
| ISAMAXKERNEL = iamax_rvv.c | |||||
| IDAMAXKERNEL = iamax_rvv.c | |||||
| ICAMAXKERNEL = izamax_rvv.c | |||||
| IZAMAXKERNEL = izamax_rvv.c | |||||
| ISAMINKERNEL = iamin_rvv.c | |||||
| IDAMINKERNEL = iamin_rvv.c | |||||
| ICAMINKERNEL = izamin_rvv.c | |||||
| IZAMINKERNEL = izamin_rvv.c | |||||
| ISMAXKERNEL = imax_rvv.c | |||||
| IDMAXKERNEL = imax_rvv.c | |||||
| ISMINKERNEL = imin_rvv.c | |||||
| IDMINKERNEL = imin_rvv.c | |||||
# ASUM, SUM, AXPY, AXPBY, COPY, DOT, NRM2, ROT, SCAL and SWAP kernels.
# All entries select *_rvv.c sources: S and D share one file per operation,
# C and Z share the z-prefixed file. DSDOT reuses dot_rvv.c.
| SASUMKERNEL = asum_rvv.c | |||||
| DASUMKERNEL = asum_rvv.c | |||||
| CASUMKERNEL = zasum_rvv.c | |||||
| ZASUMKERNEL = zasum_rvv.c | |||||
| SSUMKERNEL = sum_rvv.c | |||||
| DSUMKERNEL = sum_rvv.c | |||||
| CSUMKERNEL = zsum_rvv.c | |||||
| ZSUMKERNEL = zsum_rvv.c | |||||
| SAXPYKERNEL = axpy_rvv.c | |||||
| DAXPYKERNEL = axpy_rvv.c | |||||
| CAXPYKERNEL = zaxpy_rvv.c | |||||
| ZAXPYKERNEL = zaxpy_rvv.c | |||||
| SAXPBYKERNEL = axpby_rvv.c | |||||
| DAXPBYKERNEL = axpby_rvv.c | |||||
| CAXPBYKERNEL = zaxpby_rvv.c | |||||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||||
| SCOPYKERNEL = copy_rvv.c | |||||
| DCOPYKERNEL = copy_rvv.c | |||||
| CCOPYKERNEL = zcopy_rvv.c | |||||
| ZCOPYKERNEL = zcopy_rvv.c | |||||
| SDOTKERNEL = dot_rvv.c | |||||
| DDOTKERNEL = dot_rvv.c | |||||
| CDOTKERNEL = zdot_rvv.c | |||||
| ZDOTKERNEL = zdot_rvv.c | |||||
| DSDOTKERNEL = dot_rvv.c | |||||
| SNRM2KERNEL = nrm2_rvv.c | |||||
| DNRM2KERNEL = nrm2_rvv.c | |||||
| CNRM2KERNEL = znrm2_rvv.c | |||||
| ZNRM2KERNEL = znrm2_rvv.c | |||||
| SROTKERNEL = rot_rvv.c | |||||
| DROTKERNEL = rot_rvv.c | |||||
| CROTKERNEL = zrot_rvv.c | |||||
| ZROTKERNEL = zrot_rvv.c | |||||
| SSCALKERNEL = scal_rvv.c | |||||
| DSCALKERNEL = scal_rvv.c | |||||
| CSCALKERNEL = zscal_rvv.c | |||||
| ZSCALKERNEL = zscal_rvv.c | |||||
| SSWAPKERNEL = swap_rvv.c | |||||
| DSWAPKERNEL = swap_rvv.c | |||||
| CSWAPKERNEL = zswap_rvv.c | |||||
| ZSWAPKERNEL = zswap_rvv.c | |||||
# GEMV kernels: separate _n and _t sources, with the same S/D versus C/Z
# file sharing as the other *_rvv.c entries.
| SGEMVNKERNEL = gemv_n_rvv.c | |||||
| DGEMVNKERNEL = gemv_n_rvv.c | |||||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||||
| SGEMVTKERNEL = gemv_t_rvv.c | |||||
| DGEMVTKERNEL = gemv_t_rvv.c | |||||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||||
# GEMM kernels and packing routines, one group per precision (S, D, C, Z).
# The compute kernel file name is built from that precision's UNROLL_M and
# UNROLL_N values and carries the _zvl128b suffix.
# The ONCOPY/OTCOPY routines come from ../generic and are sized by UNROLL_N.
# The INCOPY/ITCOPY routines (sized by UNROLL_M) are only defined when
# UNROLL_M differs from UNROLL_N.
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
# TRMM kernels and copy routines. The kernel file names reuse the GEMM
# UNROLL_M x UNROLL_N values of the same precision and carry the _zvl128b
# suffix. Only the *_M copy variants are set here, all taken from
# ../generic and sized by UNROLL_M.
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||||
| STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||||
| DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||||
| CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||||
| ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||||
# TRSM kernels: no zvl128b-specific file is selected. All four precisions and
# all four variants (LN, LT, RN, RT) point at the ../generic/trsm_kernel_*.c
# sources.
# NOTE(review): the C and Z entries use the same files as S and D rather than
# a z-prefixed one; presumably the generic file handles the complex case
# itself — confirm.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
# SYMV and HEMV kernels, all from *_rvv.c sources.
# SYMV has separate _U and _L files (z-prefixed for C and Z).
# For HEMV, the _L and _M variables share zhemv_LM_rvv.c and the _U and _V
# variables share zhemv_UV_rvv.c; the C and Z entries use the same files.
| SSYMV_U_KERNEL = symv_U_rvv.c | |||||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
# SYMM and HEMM copy routines: only the *_M variants are set here, all taken
# from ../generic and sized by the GEMM UNROLL_M of the matching precision.
| SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||||
| SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||||
| DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||||
| DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||||
| CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | |||||
| CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | |||||
| ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | |||||
| ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | |||||
| CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | |||||
| CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | |||||
| ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | |||||
# Helper kernels taken unchanged from ../generic: LSAME, and one shared
# cabs.c for the S, D and Q CABS variables.
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
# GEMM_BETA defaults. Each ifndef guard assigns the *_rvv.c source only when
# the variable has not already been defined, so an earlier definition wins.
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
| @@ -0,0 +1,996 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=True | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='float' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=32 | |||||
| ELEN_PARAM=32 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||||
| VLEV='__riscv_vle32_v_f32m2' | |||||
| VLSEV='__riscv_vlse32_v_f32m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||||
| VSETVL='__riscv_vsetvl_e32m2' | |||||
| VSEV='__riscv_vse32_v_f32m2' | |||||
| VSSEV='__riscv_vsse32_v_f32m2' | |||||
| acc_vector_t='vfloat32m2_t' | |||||
| output='cgemm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='float' | |||||
| param_vector_t='vfloat32m2_t' | |||||
| */ | |||||
| #include "common.h" | |||||
/*
 * Sign and fused-multiply selection for the complex product A*B, chosen by
 * whichever variant macro (NN, NT, ... CC) is defined when this file is
 * compiled.
 *
 * The scalar tail code accumulates
 *     real += S0 * Ar * Br + S1 * Ai * Bi
 *     imag += S2 * Ai * Br + S3 * Ar * Bi
 * The vector code first forms Ai*Bi (real) and Ar*Bi (imag) with a plain
 * multiply, then folds in the Br term through VFMACC_RR / VFMACC_RI. The
 * intrinsic chosen in each group therefore has to produce the same signs as
 * S0..S3 of that group.
 *
 * NOTE(review): there is no #else / #error branch; if none of the variant
 * macros is defined, S0..S3 and VFMACC_RR / VFMACC_RI stay undefined.
 */
/* real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi (plain complex product). */
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfmacc | |||||
| #endif | |||||
/* real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi (Bi enters with flipped sign). */
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfmsac | |||||
| #endif | |||||
/* real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br (Ai enters with flipped sign). */
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 -1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfnmsac | |||||
| #endif | |||||
/* real = Ar*Br - Ai*Bi, imag = -(Ai*Br + Ar*Bi) (both Ai and Bi flipped). */
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 -1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfnmacc | |||||
| #endif | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| float B2r = B[bi + 2 * 2 + 0]; | |||||
| float B2i = B[bi + 2 * 2 + 1]; | |||||
| float B3r = B[bi + 3 * 2 + 0]; | |||||
| float B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat32m2_t ACC2r = tmp0r; | |||||
| vfloat32m2_t ACC2i = tmp0i; | |||||
| vfloat32m2_t ACC3r = tmp1r; | |||||
| vfloat32m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| float B2r = B[bi + 2 * 2 + 0]; | |||||
| float B2i = B[bi + 2 * 2 + 1]; | |||||
| float B3r = B[bi + 3 * 2 + 0]; | |||||
| float B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat32m2_t ACC2r = tmp0r; | |||||
| vfloat32m2_t ACC2i = tmp0i; | |||||
| vfloat32m2_t ACC3r = tmp1r; | |||||
| vfloat32m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| float result8 = 0; | |||||
| float result9 = 0; | |||||
| float result10 = 0; | |||||
| float result11 = 0; | |||||
| float result12 = 0; | |||||
| float result13 = 0; | |||||
| float result14 = 0; | |||||
| float result15 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result8 * alphar; | |||||
| Ci += result9 * alphar; | |||||
| Cr -= result9 * alphai; | |||||
| Ci += result8 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; | |||||
| Cr += result10 * alphar; | |||||
| Ci += result11 * alphar; | |||||
| Cr -= result11 * alphai; | |||||
| Ci += result10 * alphai; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result12 * alphar; | |||||
| Ci += result13 * alphar; | |||||
| Cr -= result13 * alphai; | |||||
| Ci += result12 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; | |||||
| Cr += result14 * alphar; | |||||
| Ci += result15 * alphar; | |||||
| Cr -= result15 * alphai; | |||||
| Ci += result14 * alphai; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,492 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=4 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=4 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||||
| VLEV='__riscv_vle64_v_f64m4' | |||||
| VLSEV='__riscv_vlse64_v_f64m4' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||||
| VSETVL='__riscv_vsetvl_e64m4' | |||||
| VSEV='__riscv_vse64_v_f64m4' | |||||
| VSSEV='__riscv_vsse64_v_f64m4' | |||||
| acc_vector_t='vfloat64m4_t' | |||||
| output='dgemm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m4_t' | |||||
| */ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,660 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=4 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='trmm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=4 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||||
| VLEV='__riscv_vle64_v_f64m4' | |||||
| VLSEV='__riscv_vlse64_v_f64m4' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||||
| VSETVL='__riscv_vsetvl_e64m4' | |||||
| VSEV='__riscv_vse64_v_f64m4' | |||||
| VSSEV='__riscv_vsse64_v_f64m4' | |||||
| acc_vector_t='vfloat64m4_t' | |||||
| output='dtrmm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m4_t' | |||||
| Overview of the code below: | |||||
| - Each tile of C is computed as alpha times the sum over k of A values times B values and is then stored; C is only written, never read. | |||||
| - N is walked in column groups of 4, followed by a 2-column tail (N & 2) and a 1-column tail (N & 1). | |||||
| - Inside every column group M is walked in row groups of 8, then a 4-row tail (both use vector intrinsics), then 2-row and 1-row tails (plain scalar code). | |||||
| - A is read from index m_top * K with tile_rows consecutive values per k step; B is read from index n_top * K with tile_cols consecutive values per k step. | |||||
| - Per tile, off is offset + m_top when LEFT is defined and -offset + n_top otherwise. | |||||
| - With BACKWARDS defined the first off k steps are skipped and K - off steps run; otherwise off + tile_rows (LEFT) or off + tile_cols steps run starting at k = 0. | |||||
| - NOTE(review): the vector tiles always execute the k = 0 step before their loop, so they assume pass_K >= 1 - confirm against the callers. | |||||
| - NOTE(review): the fixed strides (ai += 8, ai += 4) assume vsetvl grants the requested length - presumably guaranteed by the zvl128b target at LMUL=4; verify. | |||||
| */ | |||||
| #include "common.h" | |||||
| #if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT and TRANSA is defined */ | |||||
| #define BACKWARDS | |||||
| #endif | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||||
| { | |||||
| BLASLONG gvl = 0; /* vector length; assigned by the vsetvl calls in the N tails below */ | |||||
| BLASLONG m_top = 0; /* first row of the tile being processed */ | |||||
| BLASLONG n_top = 0; /* first column of the tile being processed */ | |||||
| // -- MAIN PASS: column groups of 4; rows in groups of 8, then row tails of 4, 2 and 1 | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); /* loop-local gvl: shadows the function-level gvl inside this loop body */ | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; /* start of this tile's A data */ | |||||
| BLASLONG bi = n_top * K; /* start of this tile's B data */ | |||||
| BLASLONG pass_K = K; /* number of k steps for this tile; adjusted below */ | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; /* skip the first off k steps of A (8 values per step) */ | |||||
| bi += off * 4; /* skip the first off k steps of B (4 values per step) */ | |||||
| pass_K -= off; /* K - off steps remain */ | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; /* k = 0 step is peeled: it runs before the loop regardless of pass_K */ | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); /* accumulators start as A0 * Bn instead of zero */ | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { /* remaining pass_K - 1 steps */ | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; /* first element of the tile in C */ | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); /* scale by alpha; C itself is not loaded */ | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; /* advance by ldc to the next column; the gvl * 0 term is always zero */ | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass: leftover rows (M & 4 with vector length 4; M & 2 and M & 1 in scalar code) | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); /* assigns the loop-local gvl; the next j iteration re-declares it with 8 */ | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; /* peeled k = 0 step, as in the 8-row tile */ | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; /* scalar 2x4 tile: result(2*c + r) accumulates row r, column c */ | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { /* starts at k = 0, so pass_K <= 0 runs no steps here */ | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; /* plain store: the previous value of C is discarded */ | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; /* scalar 1x4 tile: result(c) accumulates column c */ | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; /* move to the next group of 4 columns */ | |||||
| } | |||||
| // -- tails for N=2: same row walk as the main pass, with 2 columns per tile | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); /* function-level gvl: the loop-local one is out of scope here */ | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; /* peeled k = 0 step */ | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; /* scalar 2x2 tile: result(2*c + r) accumulates row r, column c */ | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; /* scalar 1x2 tile */ | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; /* the 2-column tail is done */ | |||||
| } | |||||
| // -- tails for N=1: same row walk as the main pass, with a single column per tile | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; /* peeled k = 0 step */ | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; /* scalar 2x1 tile */ | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; /* scalar 1x1 tile */ | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; /* the return value is always 0 */ | |||||
| } | |||||
| @@ -0,0 +1,791 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=8 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='float' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=32 | |||||
| ELEN_PARAM=32 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||||
| VLEV='__riscv_vle32_v_f32m2' | |||||
| VLSEV='__riscv_vlse32_v_f32m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||||
| VSETVL='__riscv_vsetvl_e32m2' | |||||
| VSEV='__riscv_vse32_v_f32m2' | |||||
| VSSEV='__riscv_vsse32_v_f32m2' | |||||
| acc_vector_t='vfloat32m2_t' | |||||
| output='sgemm_kernel_8x8_zvl128b.c' | |||||
| param_scalar_t='float' | |||||
| param_vector_t='vfloat32m2_t' | |||||
| */ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 8; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| float B4 = B[bi + 4]; | |||||
| float B5 = B[bi + 5]; | |||||
| float B6 = B[bi + 6]; | |||||
| float B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| B4 = B[bi + 4]; | |||||
| B5 = B[bi + 5]; | |||||
| B6 = B[bi + 6]; | |||||
| B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||||
| c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); | |||||
| c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); | |||||
| c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); | |||||
| c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| float B4 = B[bi + 4]; | |||||
| float B5 = B[bi + 5]; | |||||
| float B6 = B[bi + 6]; | |||||
| float B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| B4 = B[bi + 4]; | |||||
| B5 = B[bi + 5]; | |||||
| B6 = B[bi + 6]; | |||||
| B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||||
| c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); | |||||
| c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); | |||||
| c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); | |||||
| c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| float result8 = 0; | |||||
| float result9 = 0; | |||||
| float result10 = 0; | |||||
| float result11 = 0; | |||||
| float result12 = 0; | |||||
| float result13 = 0; | |||||
| float result14 = 0; | |||||
| float result15 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| result8 += A[ai + 0] * B[bi + 4]; | |||||
| result9 += A[ai + 1] * B[bi + 4]; | |||||
| result10 += A[ai + 0] * B[bi + 5]; | |||||
| result11 += A[ai + 1] * B[bi + 5]; | |||||
| result12 += A[ai + 0] * B[bi + 6]; | |||||
| result13 += A[ai + 1] * B[bi + 6]; | |||||
| result14 += A[ai + 0] * B[bi + 7]; | |||||
| result15 += A[ai + 1] * B[bi + 7]; | |||||
| ai += 2; | |||||
| bi += 8; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||||
| C[ci + 4 * ldc + 0] += alpha * result8; | |||||
| C[ci + 4 * ldc + 1] += alpha * result9; | |||||
| C[ci + 5 * ldc + 0] += alpha * result10; | |||||
| C[ci + 5 * ldc + 1] += alpha * result11; | |||||
| C[ci + 6 * ldc + 0] += alpha * result12; | |||||
| C[ci + 6 * ldc + 1] += alpha * result13; | |||||
| C[ci + 7 * ldc + 0] += alpha * result14; | |||||
| C[ci + 7 * ldc + 1] += alpha * result15; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| result4 += A[ai + 0] * B[bi + 4]; | |||||
| result5 += A[ai + 0] * B[bi + 5]; | |||||
| result6 += A[ai + 0] * B[bi + 6]; | |||||
| result7 += A[ai + 0] * B[bi + 7]; | |||||
| ai += 1; | |||||
| bi += 8; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||||
| C[ci + 4 * ldc + 0] += alpha * result4; | |||||
| C[ci + 5 * ldc + 0] += alpha * result5; | |||||
| C[ci + 6 * ldc + 0] += alpha * result6; | |||||
| C[ci + 7 * ldc + 0] += alpha * result7; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 8; | |||||
| } | |||||
| // -- tails for N=4 | |||||
| if (N & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| float B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,991 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=8 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='trmm' | |||||
| param_precision='float' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=32 | |||||
| ELEN_PARAM=32 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||||
| VLEV='__riscv_vle32_v_f32m2' | |||||
| VLSEV='__riscv_vlse32_v_f32m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||||
| VSETVL='__riscv_vsetvl_e32m2' | |||||
| VSEV='__riscv_vse32_v_f32m2' | |||||
| VSSEV='__riscv_vsse32_v_f32m2' | |||||
| acc_vector_t='vfloat32m2_t' | |||||
| output='strmm_kernel_8x8_zvl128b.c' | |||||
| param_scalar_t='float' | |||||
| param_vector_t='vfloat32m2_t' | |||||
| */ | |||||
| #include "common.h" | |||||
| #if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT / TRANSA is defined */ | |||||
| #define BACKWARDS /* when set, each tile advances ai/bi by `off` packed steps and uses pass_K = K - off; otherwise pass_K = off + tile size */ | |||||
| #endif /* defined(LEFT) != defined(TRANSA) */ | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 8; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 8; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 8; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| float B4 = B[bi + 4]; | |||||
| float B5 = B[bi + 5]; | |||||
| float B6 = B[bi + 6]; | |||||
| float B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| B4 = B[bi + 4]; | |||||
| B5 = B[bi + 5]; | |||||
| B6 = B[bi + 6]; | |||||
| B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||||
| vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); | |||||
| vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); | |||||
| vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); | |||||
| vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 8; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 8; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| float B4 = B[bi + 4]; | |||||
| float B5 = B[bi + 5]; | |||||
| float B6 = B[bi + 6]; | |||||
| float B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| B4 = B[bi + 4]; | |||||
| B5 = B[bi + 5]; | |||||
| B6 = B[bi + 6]; | |||||
| B7 = B[bi + 7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||||
| vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); | |||||
| vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); | |||||
| vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); | |||||
| vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| float result8 = 0; | |||||
| float result9 = 0; | |||||
| float result10 = 0; | |||||
| float result11 = 0; | |||||
| float result12 = 0; | |||||
| float result13 = 0; | |||||
| float result14 = 0; | |||||
| float result15 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 8; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 8; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| result8 += A[ai + 0] * B[bi + 4]; | |||||
| result9 += A[ai + 1] * B[bi + 4]; | |||||
| result10 += A[ai + 0] * B[bi + 5]; | |||||
| result11 += A[ai + 1] * B[bi + 5]; | |||||
| result12 += A[ai + 0] * B[bi + 6]; | |||||
| result13 += A[ai + 1] * B[bi + 6]; | |||||
| result14 += A[ai + 0] * B[bi + 7]; | |||||
| result15 += A[ai + 1] * B[bi + 7]; | |||||
| ai += 2; | |||||
| bi += 8; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||||
| C[ci + 4 * ldc + 0] = alpha * result8; | |||||
| C[ci + 4 * ldc + 1] = alpha * result9; | |||||
| C[ci + 5 * ldc + 0] = alpha * result10; | |||||
| C[ci + 5 * ldc + 1] = alpha * result11; | |||||
| C[ci + 6 * ldc + 0] = alpha * result12; | |||||
| C[ci + 6 * ldc + 1] = alpha * result13; | |||||
| C[ci + 7 * ldc + 0] = alpha * result14; | |||||
| C[ci + 7 * ldc + 1] = alpha * result15; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 8; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 8; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| result4 += A[ai + 0] * B[bi + 4]; | |||||
| result5 += A[ai + 0] * B[bi + 5]; | |||||
| result6 += A[ai + 0] * B[bi + 6]; | |||||
| result7 += A[ai + 0] * B[bi + 7]; | |||||
| ai += 1; | |||||
| bi += 8; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||||
| C[ci + 4 * ldc + 0] = alpha * result4; | |||||
| C[ci + 5 * ldc + 0] = alpha * result5; | |||||
| C[ci + 6 * ldc + 0] = alpha * result6; | |||||
| C[ci + 7 * ldc + 0] = alpha * result7; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 8; | |||||
| } | |||||
| // -- tails for N=4 | |||||
| if (N & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| float B2 = B[bi + 2]; | |||||
| float B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| float B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| float B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,720 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=4 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=True | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m2' | |||||
| VFMUL='__riscv_vfmul_vf_f64m2' | |||||
| VLEV='__riscv_vle64_v_f64m2' | |||||
| VLSEV='__riscv_vlse64_v_f64m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' | |||||
| VSETVL='__riscv_vsetvl_e64m2' | |||||
| VSEV='__riscv_vse64_v_f64m2' | |||||
| VSSEV='__riscv_vsse64_v_f64m2' | |||||
| acc_vector_t='vfloat64m2_t' | |||||
| output='zgemm_kernel_4x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m2_t' | |||||
| */ | |||||
| #include "common.h" | |||||
/*
 * Per-variant sign table and fused-multiply intrinsic selection for the
 * complex product accumulated by this kernel. Exactly one of the four groups
 * below is expected to match the variant macro (NN, NT, ..., CC) in effect.
 *
 * With A = Ar + i*Ai and B = Br + i*Bi, the scalar tail code accumulates
 *     real += S0 * Ar * Br + S1 * Ai * Bi
 *     imag += S2 * Ai * Br + S3 * Ar * Bi
 * The vector code first forms tmp_r = Ai * Bi and tmp_i = Ar * Bi with vfmul,
 * then applies VFMACC_RR(tmp_r, Br, Ar) and VFMACC_RI(tmp_i, Br, Ai).
 * Per the RVV spec: vfmacc = +(f*v) + acc, vfmsac = +(f*v) - acc,
 * vfnmsac = -(f*v) + acc, vfnmacc = -(f*v) - acc, so each group's intrinsics
 * reproduce the same signs as its S0..S3 values.
 *
 * NOTE(review): there is no #else / #error fallback here; if none of the
 * variant macros is defined, S0..S3 and VFMACC_* are not defined by this
 * block (confirm whether the build always supplies one).
 */
/* real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi  (plain product A * B) */
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfmacc | |||||
| #endif | |||||
/* real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi  (algebraically A * conj(B)) */
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfmsac | |||||
| #endif | |||||
/* real = Ar*Br + Ai*Bi, imag = -Ai*Br + Ar*Bi  (algebraically conj(A) * B) */
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 -1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfnmsac | |||||
| #endif | |||||
/* real = Ar*Br - Ai*Bi, imag = -Ai*Br - Ar*Bi  (algebraically conj(A) * conj(B)) */
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 -1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfnmacc | |||||
| #endif | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m2(4); | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| double B1r = B[bi + 1 * 2 + 0]; | |||||
| double B1i = B[bi + 1 * 2 + 1]; | |||||
| double B2r = B[bi + 2 * 2 + 0]; | |||||
| double B2i = B[bi + 2 * 2 + 1]; | |||||
| double B3r = B[bi + 3 * 2 + 0]; | |||||
| double B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| vfloat64m2_t ACC1r = tmp1r; | |||||
| vfloat64m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat64m2_t ACC2r = tmp0r; | |||||
| vfloat64m2_t ACC2i = tmp0i; | |||||
| vfloat64m2_t ACC3r = tmp1r; | |||||
| vfloat64m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| double result8 = 0; | |||||
| double result9 = 0; | |||||
| double result10 = 0; | |||||
| double result11 = 0; | |||||
| double result12 = 0; | |||||
| double result13 = 0; | |||||
| double result14 = 0; | |||||
| double result15 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result8 * alphar; | |||||
| Ci += result9 * alphar; | |||||
| Cr -= result9 * alphai; | |||||
| Ci += result8 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; | |||||
| Cr += result10 * alphar; | |||||
| Ci += result11 * alphar; | |||||
| Cr -= result11 * alphai; | |||||
| Ci += result10 * alphai; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result12 * alphar; | |||||
| Ci += result13 * alphar; | |||||
| Cr -= result13 * alphai; | |||||
| Ci += result12 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; | |||||
| Cr += result14 * alphar; | |||||
| Ci += result15 * alphar; | |||||
| Cr -= result15 * alphai; | |||||
| Ci += result14 * alphai; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m2(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| double B1r = B[bi + 1 * 2 + 0]; | |||||
| double B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| vfloat64m2_t ACC1r = tmp1r; | |||||
| vfloat64m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m2(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,805 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=4 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=True | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='trmm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m2' | |||||
| VFMUL='__riscv_vfmul_vf_f64m2' | |||||
| VLEV='__riscv_vle64_v_f64m2' | |||||
| VLSEV='__riscv_vlse64_v_f64m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' | |||||
| VSETVL='__riscv_vsetvl_e64m2' | |||||
| VSEV='__riscv_vse64_v_f64m2' | |||||
| VSSEV='__riscv_vsse64_v_f64m2' | |||||
| acc_vector_t='vfloat64m2_t' | |||||
| output='ztrmm_kernel_4x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m2_t' | |||||
| */ | |||||
| #include "common.h" | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* Variant 1 of 4. S0..S3 are the signs the scalar tails use: re += S0*Ar*Br + S1*Ai*Bi, im += S2*Ai*Br + S3*Ar*Bi. Here re = ArBr - AiBi, im = AiBr + ArBi (plain complex product). */ | |||||
| #define S0 1 /* sign of Ar*Br in the real part */ | |||||
| #define S1 -1 /* sign of Ai*Bi in the real part */ | |||||
| #define S2 1 /* sign of Ai*Br in the imaginary part */ | |||||
| #define S3 1 /* sign of Ar*Bi in the imaginary part */ | |||||
| #define VFMACC_RR __riscv_vfmsac /* vector path: called as VFMACC_RR(tmp_r, Br, Ar) with tmp_r pre-seeded to Ai*Bi; per the RVV spec vfmsac gives Br*Ar - tmp_r */ | |||||
| #define VFMACC_RI __riscv_vfmacc /* vector path: called as VFMACC_RI(tmp_i, Br, Ai) with tmp_i pre-seeded to Ar*Bi; per the RVV spec vfmacc gives Br*Ai + tmp_i */ | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* Variant 2: re = ArBr + AiBi, im = AiBr - ArBi, i.e. A * conj(B). */ | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmacc /* per the RVV spec: Br*Ar + tmp_r */ | |||||
| #define VFMACC_RI __riscv_vfmsac /* per the RVV spec: Br*Ai - tmp_i */ | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* Variant 3: re = ArBr + AiBi, im = ArBi - AiBr, i.e. conj(A) * B. */ | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 -1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmacc /* per the RVV spec: Br*Ar + tmp_r */ | |||||
| #define VFMACC_RI __riscv_vfnmsac /* per the RVV spec: -(Br*Ai) + tmp_i */ | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* Variant 4: re = ArBr - AiBi, im = -(AiBr + ArBi), i.e. conj(A) * conj(B). */ | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 -1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmsac /* per the RVV spec: Br*Ar - tmp_r */ | |||||
| #define VFMACC_RI __riscv_vfnmacc /* per the RVV spec: -(Br*Ai) - tmp_i */ | |||||
| #endif | |||||
| #if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT and TRANSA is defined */ | |||||
| #define BACKWARDS /* when set, the kernel skips the first off k-steps (advances ai and bi, reduces pass_K by off); otherwise pass_K becomes off plus the M-tile size (LEFT) or the N-tile size (not LEFT) */ | |||||
| #endif | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m2(4); | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4 * 2; | |||||
| bi += off * 4 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| double B1r = B[bi + 1 * 2 + 0]; | |||||
| double B1i = B[bi + 1 * 2 + 1]; | |||||
| double B2r = B[bi + 2 * 2 + 0]; | |||||
| double B2i = B[bi + 2 * 2 + 1]; | |||||
| double B3r = B[bi + 3 * 2 + 0]; | |||||
| double B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| vfloat64m2_t ACC1r = tmp1r; | |||||
| vfloat64m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat64m2_t ACC2r = tmp0r; | |||||
| vfloat64m2_t ACC2i = tmp0i; | |||||
| vfloat64m2_t ACC3r = tmp1r; | |||||
| vfloat64m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); | |||||
| vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); | |||||
| vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); | |||||
| vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); | |||||
| vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); | |||||
| vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); | |||||
| vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| double result8 = 0; | |||||
| double result9 = 0; | |||||
| double result10 = 0; | |||||
| double result11 = 0; | |||||
| double result12 = 0; | |||||
| double result13 = 0; | |||||
| double result14 = 0; | |||||
| double result15 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2 * 2; | |||||
| bi += off * 4 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result2 * alphar; | |||||
| Ci = result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = result4 * alphar; | |||||
| Ci = result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result6 * alphar; | |||||
| Ci = result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = result8 * alphar; | |||||
| Ci = result9 * alphar; | |||||
| Cr -= result9 * alphai; | |||||
| Ci += result8 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result10 * alphar; | |||||
| Ci = result11 * alphar; | |||||
| Cr -= result11 * alphai; | |||||
| Ci += result10 * alphai; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = result12 * alphar; | |||||
| Ci = result13 * alphar; | |||||
| Cr -= result13 * alphai; | |||||
| Ci += result12 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result14 * alphar; | |||||
| Ci = result15 * alphar; | |||||
| Cr -= result15 * alphai; | |||||
| Ci += result14 * alphai; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1 * 2; | |||||
| bi += off * 4 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result2 * alphar; | |||||
| Ci = result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result4 * alphar; | |||||
| Ci = result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result6 * alphar; | |||||
| Ci = result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m2(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4 * 2; | |||||
| bi += off * 2 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| double B1r = B[bi + 1 * 2 + 0]; | |||||
| double B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| vfloat64m2_t ACC1r = tmp1r; | |||||
| vfloat64m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); | |||||
| vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); | |||||
| vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2 * 2; | |||||
| bi += off * 2 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result2 * alphar; | |||||
| Ci = result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = result4 * alphar; | |||||
| Ci = result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result6 * alphar; | |||||
| Ci = result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1 * 2; | |||||
| bi += off * 2 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result2 * alphar; | |||||
| Ci = result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m2(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 4; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4 * 2; | |||||
| bi += off * 1 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| double B0r = B[bi + 0 * 2 + 0]; | |||||
| double B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat64m2_t ACC0r = tmp0r; | |||||
| vfloat64m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); | |||||
| vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2 * 2; | |||||
| bi += off * 1 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = result2 * alphar; | |||||
| Ci = result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1 * 2; | |||||
| bi += off * 1 * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| double Cr, Ci; | |||||
| Cr = result0 * alphar; | |||||
| Ci = result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
#ifdef RISCV64_ZVL128B
/*
 * GEMM tuning parameters selected when RISCV64_ZVL128B is defined.
 * Each macro is defined exactly once in this block.
 */

/* Offsets applied to the A and B buffers, and the buffer alignment mask. */
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL

/*
 * Per-precision unroll factors (S = float, D = double, C = complex float,
 * Z = complex double).
 * NOTE(review): presumably these must agree with the M x N tile sizes of the
 * kernels chosen for this target -- confirm against the kernel configuration.
 */
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8

#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4

#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4

/*
 * Per-precision P / Q / R values.
 * NOTE(review): presumably the blocking sizes used by the level-3 driver --
 * confirm where these are consumed before retuning.
 */
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 64

#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 120
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 120

#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096

#define SYMV_P 16

#endif
| #ifdef RISCV64_ZVL256B | #ifdef RISCV64_ZVL256B | ||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||