LoongArch64: add optimized cgemv/zgemv kernels (tags/v0.3.27)
| @@ -100,6 +100,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMVNKERNEL = cgemv_n_4_lsx.S | |||
| CGEMVTKERNEL = cgemv_t_4_lsx.S | |||
| CGEMMKERNEL = cgemm_kernel_8x4_lsx.S | |||
| CGEMMINCOPY = cgemm_ncopy_8_lsx.S | |||
| CGEMMITCOPY = cgemm_tcopy_8_lsx.S | |||
| @@ -115,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZGEMVNKERNEL = zgemv_n_2_lsx.S | |||
| ZGEMVTKERNEL = zgemv_t_2_lsx.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S | |||
| ZGEMMONCOPY = zgemm_ncopy_4_lsx.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S | |||
| @@ -0,0 +1,323 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define Y_ORG $r15 | |||
| #define OFFSET $r16 | |||
| #define K_LDA $r17 | |||
| #define M8 $r18 | |||
| #define T0 $r19 | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define VALPHA $vr1 | |||
| #define X0 $vr2 | |||
| #define X1 $vr3 | |||
| #define X2 $vr4 | |||
| #define X3 $vr5 | |||
| #define X4 $vr6 | |||
| #define X5 $vr7 | |||
| #define X6 $vr8 | |||
| #define X7 $vr9 | |||
| #define Y0 $vr10 | |||
| #define Y1 $vr11 | |||
| #define A0 $vr12 | |||
| #define A1 $vr13 | |||
| #define A2 $vr14 | |||
| #define A3 $vr15 | |||
| #define A4 $vr16 | |||
| #define A5 $vr17 | |||
| #define A6 $vr18 | |||
| #define A7 $vr19 | |||
| #define A8 $vr20 | |||
| #define A9 $vr21 | |||
| #define A10 $vr22 | |||
| #define A11 $vr23 | |||
| #define A12 $vr24 | |||
| #define A13 $vr25 | |||
| #define A14 $vr26 | |||
| #define A15 $vr27 | |||
| #define TMP0 $vr28 | |||
| #define TMP1 $vr29 | |||
| #define TMP2 $vr30 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 0 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 0 | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 1 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 1 | |||
| #endif | |||
| #endif | |||
| .macro CLOAD_X_4 | |||
| GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_X_4_GAP | |||
| vldrepl.d X0, X, 0x00 | |||
| PTR_ADD T0, X, INC_X | |||
| vldrepl.d X1, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| vldrepl.d X2, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| vldrepl.d X3, T0, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_X_1 | |||
| GLDREPL v, d, X0, X, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_Y_4 | |||
| GLD v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro CLOAD_Y_4_GAP | |||
| fld.d $f10, Y, 0 | |||
| fldx.d $f13, Y, INC_Y | |||
| PTR_ALSL T0, INC_Y, Y, 1 | |||
| fld.d $f11, T0, 0 | |||
| fldx.d $f17, T0, INC_Y | |||
| vpackev.d Y0, A1, Y0 | |||
| vpackev.d Y1, A5, Y1 | |||
| .endm | |||
| .macro CLOAD_Y_1 | |||
| fld.d $f10, Y, 0 | |||
| .endm | |||
| .macro CSTORE_Y_4 | |||
| GST v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro CSTORE_Y_4_GAP | |||
| vstelm.d Y0, Y, 0, 0 | |||
| PTR_ADD T0, Y, INC_Y | |||
| vstelm.d Y0, T0, 0, 1 | |||
| PTR_ADD T0, T0, INC_Y | |||
| vstelm.d Y1, T0, 0, 0 | |||
| PTR_ADD T0, T0, INC_Y | |||
| vstelm.d Y1, T0, 0, 1 | |||
| .endm | |||
| .macro CSTORE_Y_1 | |||
| fst.d $f10, Y, 0 | |||
| .endm | |||
| .macro CGEMV_N_4x4 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ | |||
| Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ | |||
| Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CGEMV_N_1x4 | |||
| GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ | |||
| Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ | |||
| Y0, X3, A6, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CGEMV_N_1x1 | |||
| fld.d $f12, PA0, 0 | |||
| PTR_ADDI PA0, PA0, 0x08 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req | |||
| PTR_SRLI J, N, 2 | |||
| beqz J, .L_\XW\()_N_3 | |||
| PTR_SLLI K_LDA, LDA, 2 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| .L_\XW\()_N_L4: | |||
| CLOAD_\X_4 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 2 | |||
| beqz I, .L_\XW\()_M_3 | |||
| .align 5 | |||
| .L_\XW\()_M_L4: | |||
| CLOAD_\Y_4 | |||
| CGEMV_N_4x4 | |||
| CSTORE_\Y_4 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| PTR_ADDI K, K, 4 | |||
| bnez I, .L_\XW\()_M_L4 | |||
| .L_\XW\()_M_3: | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| CLOAD_\Y_1 | |||
| CGEMV_N_1x4 | |||
| CSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #endif | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| bnez J, .L_\XW\()_N_L4 | |||
| .L_\XW\()_N_3: | |||
| andi J, N, 3 | |||
| beqz J, .L_END | |||
| .L_\XW\()_N_L1: | |||
| CLOAD_\X_1 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| CLOAD_\Y_1 | |||
| CGEMV_N_1x1 | |||
| CSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| PTR_SUB K_LDA, LDA, M8 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD X, X, INC_X | |||
| bnez J, .L_\XW\()_N_L1 | |||
| b .L_END | |||
| .endm | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ | |||
| PTR_ALSL I, I, J, 1 | |||
| GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
| // Init VALPHA | |||
| vpackev.w $vr0, $vr1, $vr0 | |||
| vpackev.d VALPHA, $vr0, $vr0 | |||
| move Y_ORG, Y | |||
| move PA0, A | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #endif | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 // Obtain the offset address | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_0_1 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_1 - .L_GAP_TABLE | |||
| .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
| CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1 | |||
| .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
| CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 | |||
| .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
| CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ | |||
| X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X4, X4, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X5, X5, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X6, X6, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X7, X7, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2, \ | |||
| X4, VALPHA, X4, TMP0, TMP1, TMP2, \ | |||
| X5, VALPHA, X5, TMP0, TMP1, TMP2, \ | |||
| X6, VALPHA, X6, TMP0, TMP1, TMP2, \ | |||
| X7, VALPHA, X7, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_X_8_GAP | |||
| @@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvldrepl.d X7, T0, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X4, X4, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X5, X5, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X6, X6, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X7, X7, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2, \ | |||
| X4, VALPHA, X4, TMP0, TMP1, TMP2, \ | |||
| X5, VALPHA, X5, TMP0, TMP1, TMP2, \ | |||
| X6, VALPHA, X6, TMP0, TMP1, TMP2, \ | |||
| X7, VALPHA, X7, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_Y_8 | |||
| @@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro CLOAD_X_1 | |||
| GLDREPL xv, d, X0, X, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CLOAD_Y_1 | |||
| @@ -0,0 +1,290 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define PY0 $r14 | |||
| #define X_ORG $r15 | |||
| #define PY1 $r16 | |||
| #define K_LDA $r17 | |||
| #define PY2 $r18 | |||
| #define T0 $r19 | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define M8 $r30 | |||
| #define VALPHA $vr0 | |||
| #define X0 $vr1 | |||
| #define X1 $vr2 | |||
| #define A0 $vr3 | |||
| #define A1 $vr4 | |||
| #define A2 $vr5 | |||
| #define A3 $vr6 | |||
| #define A4 $vr7 | |||
| #define A5 $vr8 | |||
| #define A6 $vr9 | |||
| #define A7 $vr10 | |||
| #define A8 $vr11 | |||
| #define A9 $vr12 | |||
| #define A10 $vr13 | |||
| #define A11 $vr14 | |||
| #define A12 $vr15 | |||
| #define A13 $vr16 | |||
| #define A14 $vr17 | |||
| #define A15 $vr18 | |||
| #define TP0 $vr19 | |||
| #define TP1 $vr20 | |||
| #define TP2 $vr21 | |||
| #define TP3 $vr22 | |||
| #define TP4 $vr23 | |||
| #define TP5 $vr24 | |||
| #define TP6 $vr25 | |||
| #define TP7 $vr26 | |||
| #define TMP0 $vr27 | |||
| #define TMP1 $vr28 | |||
| #define TMP2 $vr29 | |||
| #define Y0 $vr3 | |||
| #define Y1 $vr4 | |||
| #define Y2 $vr5 | |||
| #define Y3 $vr6 | |||
| #define Y4 $vr7 | |||
| #define Y5 $vr8 | |||
| #define Y6 $vr9 | |||
| #define Y7 $vr10 | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| #define GXCONJ1 0 | |||
| #define GCONJ1 0 | |||
| #else | |||
| #define GXCONJ1 1 | |||
| #define GCONJ1 0 | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 0 | |||
| #else | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 1 | |||
| #endif | |||
| .macro ZERO_Y4 | |||
| GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 | |||
| .endm | |||
| .macro ZERO_Y1 | |||
| GXOR v, v, TP0, TP0, TP0 | |||
| .endm | |||
| .macro CLOAD_X4 | |||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||
| .endm | |||
| .macro CLOAD_X4_GAP | |||
| fld.d $f1, X, 0x00 | |||
| fldx.d $f3, X, INC_X | |||
| PTR_ALSL T0, INC_X, X, 1 | |||
| fld.d $f2, T0, 0x00 | |||
| fldx.d $f4, T0, INC_X | |||
| vpackev.d X0, A0, X0 | |||
| vpackev.d X1, A1, X1 | |||
| .endm | |||
| .macro CGEMV_T_4x4 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||
| TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ | |||
| TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ | |||
| TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro CGEMV_T_LSX XW:req, X4:req | |||
| PTR_SRLI J, N, 2 | |||
| beqz J, .L_\XW\()_N_3 | |||
| PTR_SLLI K_LDA, LDA, 2 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| .L_\XW\()_N_L4: | |||
| ZERO_Y4 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 2 | |||
| beqz I, .L_\XW\()_M_3 | |||
| .align 5 | |||
| .L_\XW\()_M_L4: | |||
| CLOAD_\X4 | |||
| CGEMV_T_4x4 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| bnez I, .L_\XW\()_M_L4 | |||
| .L_\XW\()_M_3: | |||
| // Accumulated | |||
| GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| fld.d $f1, X, 0x00 | |||
| fld.d $f11, PA0, 0x00 | |||
| fld.d $f12, PA1, 0x00 | |||
| fld.d $f13, PA2, 0x00 | |||
| fld.d $f14, PA3, 0x00 | |||
| #if __loongarch_grlen == 64 | |||
| GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 | |||
| #elif __loongarch_grlen == 32 | |||
| GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 | |||
| #else | |||
| GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 | |||
| #endif | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ | |||
| A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| fld.d $f11, Y, 0x00 | |||
| fldx.d $f12, Y, INC_Y | |||
| PTR_ALSL PY0, INC_Y, Y, 1 | |||
| fld.d $f13, PY0, 0x00 | |||
| fldx.d $f14, PY0, INC_Y | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ | |||
| A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #endif | |||
| fst.d $f11, Y, 0x00 | |||
| fstx.d $f12, Y, INC_Y | |||
| fst.d $f13, PY0, 0x00 | |||
| fstx.d $f14, PY0, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| bnez J, .L_\XW\()_N_L4 | |||
| .L_\XW\()_N_3: | |||
| andi J, N, 3 | |||
| beqz J, .L_END | |||
| PTR_SUB K_LDA, LDA, M8 | |||
| .L_\XW\()_N_1: | |||
| ZERO_Y1 | |||
| move X, X_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| fld.d $f3, PA0, 0x00 | |||
| fld.d $f1, X, 0x00 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| PTR_ADDI PA0, PA0, 0x08 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| fld.d $f3, Y, 0x00 | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||
| fst.d $f3, Y, 0x00 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD Y, Y, INC_Y | |||
| bnez J, .L_\XW\()_N_1 | |||
| b .L_END | |||
| .endm | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
| // Init VALPHA | |||
| vpackev.w $vr0, $vr1, $vr0 | |||
| vpackev.d VALPHA, $vr0, $vr0 | |||
| move X_ORG, X | |||
| move PA0, A | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA | |||
| #endif | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1 - .L_GAP_TABLE | |||
| .L_GAP_0: /* if (incx == 1) */ | |||
| CGEMV_T_LSX GAP_0, X4 | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| CGEMV_T_LSX GAP_1, X4_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .ifeqs "\suf_op", "s" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .else | |||
| vor.v \out, \in, \in | |||
| .endif | |||
| .endif | |||
| .ifnb \more | |||
| GCOMPLEXACC \pre_op, \suf_op, \more | |||
| .endif | |||
| @@ -0,0 +1,296 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define Y_ORG $r15 | |||
| #define OFFSET $r16 | |||
| #define K_LDA $r17 | |||
| #define M16 $r18 | |||
| #define T0 $r19 | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define VALPHA $vr1 | |||
| #define X0 $vr2 | |||
| #define X1 $vr3 | |||
| #define X2 $vr4 | |||
| #define X3 $vr5 | |||
| #define X4 $vr6 | |||
| #define X5 $vr7 | |||
| #define X6 $vr8 | |||
| #define X7 $vr9 | |||
| #define Y0 $vr10 | |||
| #define Y1 $vr11 | |||
| #define A0 $vr12 | |||
| #define A1 $vr13 | |||
| #define A2 $vr14 | |||
| #define A3 $vr15 | |||
| #define A4 $vr16 | |||
| #define A5 $vr17 | |||
| #define A6 $vr18 | |||
| #define A7 $vr19 | |||
| #define A8 $vr20 | |||
| #define A9 $vr21 | |||
| #define A10 $vr22 | |||
| #define A11 $vr23 | |||
| #define A12 $vr24 | |||
| #define A13 $vr25 | |||
| #define A14 $vr26 | |||
| #define A15 $vr27 | |||
| #define TMP0 $vr28 | |||
| #define TMP1 $vr29 | |||
| #define TMP2 $vr30 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 0 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 0 | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 1 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 1 | |||
| #endif | |||
| #endif | |||
| .macro ZLOAD_X_2 | |||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_2_GAP | |||
| vld X0, X, 0 | |||
| PTR_ADD T0, X, INC_X | |||
| vld X1, T0, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_1 | |||
| GLD v, , X0, X, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_Y_2 | |||
| GLD v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro ZLOAD_Y_2_GAP | |||
| vld $vr10, Y, 0 | |||
| vldx $vr11, Y, INC_Y | |||
| .endm | |||
| .macro ZLOAD_Y_1 | |||
| vld $vr10, Y, 0 | |||
| .endm | |||
| .macro ZGEMV_N_2x2 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x2 | |||
| GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x1 | |||
| GLD_INC v, , 0x10, $vr12, PA0, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZSTORE_Y_2 | |||
| GST v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro ZSTORE_Y_2_GAP | |||
| vst Y0, Y, 0 | |||
| vstx Y1, Y, INC_Y | |||
| .endm | |||
| .macro ZSTORE_Y_1 | |||
| vst $vr10, Y, 0 | |||
| .endm | |||
| .macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req | |||
| PTR_SRLI J, N, 1 | |||
| beqz J, .L_\XW\()_N_1 | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M16 | |||
| .L_\XW\()_N_L2: | |||
| ZLOAD_\X_2 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 1 | |||
| beqz I, .L_\XW\()_M_1 | |||
| .align 5 | |||
| .L_\XW\()_M_L2: | |||
| ZLOAD_\Y_2 | |||
| ZGEMV_N_2x2 | |||
| ZSTORE_\Y_2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL Y, INC_Y, Y, 1 | |||
| PTR_ADDI K, K, 4 | |||
| bnez I, .L_\XW\()_M_L2 | |||
| .L_\XW\()_M_1: | |||
| andi I, M, 1 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| ZLOAD_\Y_1 | |||
| ZGEMV_N_1x2 | |||
| ZSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #endif | |||
| PTR_ALSL X, INC_X, X, 1 | |||
| bnez J, .L_\XW\()_N_L2 | |||
| .L_\XW\()_N_1: | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| .L_\XW\()_N_L1: | |||
| ZLOAD_\X_1 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| ZLOAD_\Y_1 | |||
| ZGEMV_N_1x1 | |||
| ZSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| PTR_SUB K_LDA, LDA, M16 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD X, X, INC_X | |||
| bnez J, .L_\XW\()_N_L1 | |||
| b .L_END | |||
| .endm | |||
| PROLOGUE | |||
| // Entry point. Dispatches to one of four ZGEMV_N_LSX instantiations based | |||
| // on whether inc_x and inc_y are unit strides (contiguous loads) or not. | |||
| // inc_y is the 11th C argument, so it arrives on the stack, not in a register. | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| // Build a 2-bit dispatch index in I: bit1 = (inc_x != 1), bit0 = (inc_y != 1). | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */ | |||
| PTR_ALSL I, I, J, 1 | |||
| // Scale lda/inc_x/inc_y and M16 = m by 16 bytes = sizeof(double complex), | |||
| // so they can be used directly as byte offsets from here on. | |||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
| // Init VALPHA: pack {alpha_r, alpha_i} into one 128-bit vector | |||
| // ($vr0/$vr1 are the low lanes of the $f0/$f1 FP argument registers). | |||
| vpackev.d VALPHA, $vr1, $vr0 | |||
| move Y_ORG, Y | |||
| move PA0, A | |||
| // PA1 = column pointer one lda past PA0 (pointer-width add per grlen). | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA | |||
| #endif | |||
| // Indirect jump through a halfword offset table indexed by I (2 bytes/entry). | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 // Obtain the offset address | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_0_1 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_1 - .L_GAP_TABLE | |||
| // Each instantiation ends with "b .L_END", so the variants do not fall through. | |||
| .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
| ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 | |||
| .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
| ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 | |||
| .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
| ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 | |||
| GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_4_GAP | |||
| @@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvpermi.q X3, X3, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_Y_4 | |||
| @@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GLD xv, , X0, X, 0x00 | |||
| GPERMI xv, q, X0, X0, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x1 | |||
| @@ -0,0 +1,268 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| // ---- Scalar argument registers (match the C prototype above) ---- | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| // INC_Y is loaded from the stack in the PROLOGUE (11th argument). | |||
| #define INC_Y $r6 | |||
| // ---- Loop counters / scratch ---- | |||
| #define J $r12 | |||
| #define I $r13 | |||
| // NOTE: K and PY0 alias $r14 — K is only used transiently for the | |||
| // dispatch-table load in the PROLOGUE. | |||
| #define K $r14 | |||
| #define PY0 $r14 | |||
| #define X_ORG $r15 | |||
| #define PY1 $r16 | |||
| // K_LDA: byte step to advance the column pointers past the rows already | |||
| // consumed (computed as n_cols*LDA - M16 in ZGEMV_T_LSX). | |||
| #define K_LDA $r17 | |||
| #define PY2 $r18 | |||
| #define T0 $r19 | |||
| // PA0..PA7: per-column pointers into A. | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| // M16 = m * 16 (bytes; 16 = sizeof(double complex), set up in PROLOGUE). | |||
| #define M16 $r30 | |||
| // ---- Vector registers ---- | |||
| #define VALPHA $vr0 | |||
| #define X0 $vr1 | |||
| #define X1 $vr2 | |||
| #define A0 $vr3 | |||
| #define A1 $vr4 | |||
| #define A2 $vr5 | |||
| #define A3 $vr6 | |||
| #define A4 $vr7 | |||
| #define A5 $vr8 | |||
| #define A6 $vr9 | |||
| #define A7 $vr10 | |||
| #define A8 $vr11 | |||
| #define A9 $vr12 | |||
| #define A10 $vr13 | |||
| #define A11 $vr14 | |||
| #define A12 $vr15 | |||
| #define A13 $vr16 | |||
| #define A14 $vr17 | |||
| #define A15 $vr18 | |||
| // TP0..TP7: per-column complex dot-product accumulators. | |||
| #define TP0 $vr19 | |||
| #define TP1 $vr20 | |||
| #define TP2 $vr21 | |||
| #define TP3 $vr22 | |||
| #define TP4 $vr23 | |||
| #define TP5 $vr24 | |||
| #define TP6 $vr25 | |||
| #define TP7 $vr26 | |||
| #define TMP0 $vr27 | |||
| #define TMP1 $vr28 | |||
| #define TMP2 $vr29 | |||
| // CAUTION: Y0..Y7 alias A0..A7 ($vr3..$vr10); the A names and Y names are | |||
| // never live at the same time. | |||
| #define Y0 $vr3 | |||
| #define Y1 $vr4 | |||
| #define Y2 $vr5 | |||
| #define Y3 $vr6 | |||
| #define Y4 $vr7 | |||
| #define Y5 $vr8 | |||
| #define Y6 $vr9 | |||
| #define Y7 $vr10 | |||
| // Conjugation flags for the two GCOMPLEXMADD stages: | |||
| // GXCONJ1/GCONJ1 — used in the A(^H/^T)*x accumulation stage; | |||
| // GXCONJ2/GCONJ2 — used in the y += alpha*acc update stage. | |||
| // Derived from the BLAS CONJ/XCONJ compile-time variants. | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| #define GXCONJ1 0 | |||
| #define GCONJ1 0 | |||
| #else | |||
| #define GXCONJ1 1 | |||
| #define GCONJ1 0 | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 0 | |||
| #else | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 1 | |||
| #endif | |||
| // Clear the two column accumulators TP0/TP1 (vector XOR with themselves). | |||
| .macro ZERO_Y2 | |||
| GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 | |||
| .endm | |||
| // Clear the single column accumulator TP0 (vector XOR with itself). | |||
| .macro ZERO_Y1 | |||
| GXOR v, v, TP0, TP0, TP0 | |||
| .endm | |||
| // Load two consecutive complex-double x elements (contiguous case, | |||
| // inc_x == 1): X0 = x[i], X1 = x[i+1] — 16 bytes each. | |||
| .macro ZLOAD_X2 | |||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||
| .endm | |||
| // Load two strided complex-double x elements (inc_x != 1): | |||
| // X0 = *X, X1 = *(X + INC_X). INC_X is already scaled to bytes. | |||
| .macro ZLOAD_X2_GAP | |||
| vld X0, X, 0 | |||
| vldx X1, X, INC_X | |||
| .endm | |||
| // Inner 2x2 step: consume two rows from two columns of A. | |||
| // Loads A0,A1 = next two elements of column 0 (PA0) and A2,A3 = next two | |||
| // of column 1 (PA1); GLD_INC post-increments each pointer by 0x10 bytes | |||
| // per load. Then complex fused-madd into the column accumulators: | |||
| // TP0 += A0*X0 + A1*X1 ; TP1 += A2*X0 + A3*X1 | |||
| // (conjugation per GXCONJ1/GCONJ1; TMP0..TMP2 are scratch). | |||
| .macro ZGEMV_T_2x2 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||
| TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| // Main zgemv_t driver: y += alpha * op(A) * x, column by column. | |||
| // \XW uniquifies the local labels per instantiation; \X2 selects the | |||
| // contiguous or strided x-load macro (ZLOAD_X2 / ZLOAD_X2_GAP). | |||
| // Outer loop: pairs of columns (J = N/2); inner loop: pairs of rows (M/2). | |||
| .macro ZGEMV_T_LSX XW:req, X2:req | |||
| PTR_SRLI J, N, 1 | |||
| beqz J, .L_\XW\()_N_1 | |||
| // K_LDA = 2*lda - m*16: after walking m rows of a column pair, this | |||
| // advances PA0/PA1 to the start of the next column pair. | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M16 | |||
| .L_\XW\()_N_L2: | |||
| ZERO_Y2 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 1 | |||
| beqz I, .L_\XW\()_M_1 | |||
| .align 5 | |||
| .L_\XW\()_M_L2: | |||
| // Two rows x two columns per iteration; X advances by 2*inc_x bytes. | |||
| ZLOAD_\X2 | |||
| ZGEMV_T_2x2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 1 | |||
| bnez I, .L_\XW\()_M_L2 | |||
| .L_\XW\()_M_1: | |||
| // Accumulated | |||
| // Horizontal-reduce the partial sums: Y0 <- acc(TP0), Y1 <- acc(TP1). | |||
| // Y0/Y1 alias A0/A1, which the remainder loop below keeps accumulating into. | |||
| GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 | |||
| andi I, M, 1 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| // Odd-row remainder: one x element against one element of each column. | |||
| GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 | |||
| #if __loongarch_grlen == 64 | |||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #elif __loongarch_grlen == 32 | |||
| GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #else | |||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #endif | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| // y update: A8 = y[j], A9 = y[j+1]; then A8/A9 += alpha * (A0/A1). | |||
| vld A8, Y, 0x00 | |||
| vldx A9, Y, INC_Y | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #endif | |||
| // $vr11/$vr12 are A8/A9 (the updated y pair) — stored back, then Y += 2*inc_y. | |||
| vst $vr11, Y, 0x00 | |||
| vstx $vr12, Y, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 1 | |||
| bnez J, .L_\XW\()_N_L2 | |||
| .L_\XW\()_N_1: | |||
| // Odd-column remainder: one column at a time. | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| PTR_SUB K_LDA, LDA, M16 | |||
| .L_\XW\()_N_L1: | |||
| ZERO_Y1 | |||
| move X, X_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| // TP0 += A[i,j] * x[i], one complex element per iteration. | |||
| GLD v, , A0, PA0, 0x00, X0, X, 0x00 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| PTR_ADDI PA0, PA0, 0x10 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| // y[j] += alpha * TP0; $vr3 is A0, which received the result. | |||
| vld A0, Y, 0x00 | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||
| vst $vr3, Y, 0x00 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD Y, Y, INC_Y | |||
| bnez J, .L_\XW\()_N_L1 | |||
| b .L_END | |||
| .endm | |||
| PROLOGUE | |||
| // Entry point. Dispatches to the contiguous-x or strided-x instantiation | |||
| // of ZGEMV_T_LSX; y stores always use vld/vldx so no y-variant is needed. | |||
| // inc_y is the 11th C argument and therefore arrives on the stack. | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| // Dispatch index: I = 0 if inc_x == 1, else 1. | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| // Scale lda/inc_x/inc_y and M16 = m by 16 bytes = sizeof(double complex). | |||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
| // Init VALPHA: pack {alpha_r, alpha_i} from $f0/$f1 into one vector. | |||
| vpackev.d VALPHA, $vr1, $vr0 | |||
| move X_ORG, X | |||
| move PA0, A | |||
| // PA1 = second column pointer, one lda past PA0 (pointer-width add). | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA | |||
| #endif | |||
| // Indirect jump through a halfword offset table indexed by I (2 bytes/entry). | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1 - .L_GAP_TABLE | |||
| // Each instantiation ends with "b .L_END", so the variants do not fall through. | |||
| .L_GAP_0: /* if (incx == 1) */ | |||
| ZGEMV_T_LSX GAP_0, X2 | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| ZGEMV_T_LSX GAP_1, X2_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||