@@ -10,9 +10,6 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
DGEMVNKERNEL = dgemv_n_8_lasx.S
DGEMVTKERNEL = dgemv_t_8_lasx.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -132,16 +132,12 @@ CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
ifndef DGEMVNKERNEL
DGEMVNKERNEL = ../arm/gemv_n.c
endif
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
ifndef DGEMVTKERNEL
DGEMVTKERNEL = ../arm/gemv_t.c
endif
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
@@ -1,546 +0,0 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/14 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M8 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define Y0 $xr10
#define Y1 $xr11
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
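// DLOAD_X_{8,4,2,1}: broadcast that many consecutive x elements (one per
// $xr register) and pre-multiply each by alpha, so the inner loops below
// only need fused multiply-adds.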
.macro DLOAD_X_8
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
.macro DLOAD_X_4
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
.macro DLOAD_X_2
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
.macro DLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GMUL xvf, d, X0, X0, VALPHA
.endm
.macro DLOAD_Y_8
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro DLOAD_Y_4
GLD xv, , Y0, Y, 0
.endm
.macro DLOAD_Y_1
fld.d $f10, Y, 0
.endm
.macro DSTORE_Y_8
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro DSTORE_Y_4
GST xv, , Y0, Y, 0
.endm
.macro DSTORE_Y_1
fst.d $f10, Y, 0
.endm
// Strided Y (inc_y != 1): vector load/store instructions cannot be used
.macro DLOAD_Y_8_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
PTR_ALSL T0, INC_Y, Y, 2
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
PTR_ADD T0, T0, INC_Y
PTR_ADD T0, T0, INC_Y
fld.d $f18, T0, 0
fldx.d $f19, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
.macro DLOAD_Y_4_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
.endm
.macro DSTORE_Y_8_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 3
.endm
.macro DSTORE_Y_4_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
.endm
.macro DLOAD_X_8_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X4, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X5, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X6, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X7, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
.macro DLOAD_X_4_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
.macro DLOAD_X_2_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
.macro DGEMV_N_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1, \
Y0, A8, X4, Y0, Y1, A9, X4, Y1, \
Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
Y0, A14, X7, Y0, Y1, A15, X7, Y1
.endm
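// DGEMV_N_8x8: one 8x8 tile. Y0:Y1 hold eight consecutive y values; each
// column pointer PA0..PA7 supplies eight matrix elements that are
// multiply-accumulated against the broadcast, alpha-scaled X0..X7.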
.macro DGEMV_N_4x8
GLD_INC xv, , 0x20, A0, PA0, 0, \
A2, PA1, 0, \
A4, PA2, 0, \
A6, PA3, 0, \
A8, PA4, 0, \
A10, PA5, 0, \
A12, PA6, 0, \
A14, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, \
Y0, A6, X3, Y0, \
Y0, A8, X4, Y0, \
Y0, A10, X5, Y0, \
Y0, A12, X6, Y0, \
Y0, A14, X7, Y0
.endm
.macro DGEMV_N_1x8
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, \
$f10, $f18, $f5, $f10, \
$f10, $f20, $f6, $f10, \
$f10, $f22, $f7, $f10, \
$f10, $f24, $f8, $f10, \
$f10, $f26, $f9, $f10
.endm
.macro DGEMV_N_8x4
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1
.endm
.macro DGEMV_N_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, Y0, A6, X3, Y0
.endm
.macro DGEMV_N_1x4
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
.endm
.macro DGEMV_N_8x2
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1
.endm
.macro DGEMV_N_4x2
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0
.endm
.macro DGEMV_N_1x2
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10
.endm
.macro DGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
fmadd.d $f10, $f12, $f2, $f10
.endm
.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
DLOAD_\X_8
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
DLOAD_\Y_8
DGEMV_N_8x8
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 3
PTR_ADDI K, K, 8
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\Y_4
DGEMV_N_4x8
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_M_3:
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
DLOAD_\Y_1
DGEMV_N_1x8
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
PTR_ALSL X, INC_X, X, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
DLOAD_\X_4
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\Y_8
DGEMV_N_8x4
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\Y_4
DGEMV_N_4x4
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_4_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
DLOAD_\Y_1
DGEMV_N_1x4
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
DLOAD_\X_2
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\Y_8
DGEMV_N_8x2
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\Y_4
DGEMV_N_4x2
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_2_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
DLOAD_\Y_1
DGEMV_N_1x2
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD PA1, PA1, K_LDA
PTR_ALSL X, INC_X, X, 1
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
DLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
DLOAD_\Y_1
DGEMV_N_1x1
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 24 + 4
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
PTR_ALSL I, I, J, 1
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
xvreplve0.d VALPHA, $xr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
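/* Editor's note (illustrative sketch, not in the original source): the code
 * above builds index = 2*(inc_x != 1) + (inc_y != 1) in I and jumps through
 * 16-bit offsets stored relative to .L_GAP_TABLE. In GNU C terms, with
 * hypothetical names "off" and "base":
 *
 *   int16_t off[4];                           // the four .hword entries
 *   goto *(void *)((char *)base + off[idx]);  // computed goto
 */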
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 24 + 4
jirl $r0, $r1, 0x0
EPILOGUE
@@ -1,468 +0,0 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/17 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
#define VALPHA $xr0
#define X0 $xr1
#define X1 $xr2
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
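// Note: Y0..Y7 alias A0..A7 ($xr3..$xr10). This is safe: the A registers
// are dead by the time GACC materializes the reduced column sums in Y0..Y7.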
.macro ZERO_Y8
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm
.macro ZERO_Y4
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
.macro ZERO_Y2
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
.macro DLOAD_X8
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
.macro DLOAD_X4
GLD xv, , X0, X, 0x00
.endm
.macro DLOAD_X8_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
PTR_ALSL T0, INC_X, X, 2
fld.d $f2, T0, 0x00
fldx.d $f3, T0, INC_X
PTR_ALSL T0, INC_X, T0, 1
fld.d $f4, T0, 0x00
fldx.d $f5, T0, INC_X
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm
.macro DLOAD_X4_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
.endm
.macro DGEMV_T_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3, \
TP4, A8, X0, TP4, TP4, A9, X1, TP4, \
TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
TP7, A14, X0, TP7, TP7, A15, X1, TP7
.endm
.macro DGEMV_T_8x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \
A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3, \
TP4, A8, X0, TP4, TP5, A10, X0, TP5, \
TP6, A12, X0, TP6, TP7, A14, X0, TP7
.endm
.macro DGEMV_T_4x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3
.endm
.macro DGEMV_T_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3
.endm
.macro DGEMV_T_2x8
GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1
.endm
.macro DGEMV_T_2x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1
.endm
.macro DGEMV_T XW:req, X8:req, X4:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
ZERO_Y8
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
DLOAD_\X8
DGEMV_T_8x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\X4
DGEMV_T_8x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_M_3:
// Accumulate: reduce each TPn across its lanes into Yn
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
Y5, TP5, Y6, TP6, Y7, TP7
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
fld.d $f15, PA4, 0x00
fld.d $f16, PA5, 0x00
fld.d $f17, PA6, 0x00
fld.d $f18, PA7, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
$f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
PTR_ALSL PY1, INC_Y, Y, 2
fld.d $f15, PY1, 0x00
fldx.d $f16, PY1, INC_Y
PTR_ALSL PY2, INC_Y, PY1, 1
fld.d $f17, PY2, 0x00
fldx.d $f18, PY2, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
$f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
fst.d $f15, PY1, 0x00
fstx.d $f16, PY1, INC_Y
fst.d $f17, PY2, 0x00
fstx.d $f18, PY2, INC_Y
PTR_ALSL Y, INC_Y, Y, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
ZERO_Y4
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\X8
DGEMV_T_4x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\X4
DGEMV_T_4x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_4_M_3:
// Accumulate: reduce each TPn across its lanes into Yn
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
PTR_ALSL Y, INC_Y, Y, 2
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
ZERO_Y2
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\X8
DGEMV_T_2x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\X4
DGEMV_T_2x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_2_M_3:
// Accumulate: reduce each TPn across its lanes into Yn
GACC xvf, d, Y0, TP0, Y1, TP1
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
PTR_ALSL Y, INC_Y, Y, 1
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
fmadd.d $f19, $f3, $f1, $f19
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
fld.d $f3, Y, 0x00
fmadd.d $f3, ALPHA, $f19, $f3
fst.d $f3, Y, 0x00
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 24 + 3
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
xvreplve0.d VALPHA, $xr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
DGEMV_T GAP_0, X8, X4
.L_GAP_1: /* if (inc_x != 1) */
DGEMV_T GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 24 + 3
jirl $r0, $r1, 0x0
EPILOGUE
@@ -1,313 +0,0 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#if __loongarch_grlen == 64
#define LA_REG int64_t
#define REG_SIZE 8
#define REG_LOG 3
#define PTR_ADDI addi.d
#define PTR_ADD add.d
#define PTR_SUB sub.d
#define PTR_LD ld.d
#define PTR_ST st.d
#define PTR_SLLI slli.d
#define PTR_SRLI srli.d
#define PTR_ALSL alsl.d
#else
#define LA_REG int32_t
#define REG_SIZE 4
#define REG_LOG 2
#define PTR_ADDI addi.w
#define PTR_ADD add.w
#define PTR_SUB sub.w
#define PTR_LD ld.w
#define PTR_ST st.w
#define PTR_SLLI slli.w
#define PTR_SRLI srli.w
#define PTR_ALSL alsl.w
#endif
#if __loongarch_frlen == 64
#define FREG_SIZE 8
#define FREG_LOG 3
#define PTR_FLD fld.d
#define PTR_FST fst.d
#else
#define FREG_SIZE 4
#define FREG_LOG 2
#define PTR_FLD fld.s
#define PTR_FST fst.s
#endif
// The maximum number of registers available to the user that do not
// need to be preserved across calls (i.e. caller-saved).
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED 24
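// Example: "push_if_used 17 + 8, 24 + 3" spills the 8 integer registers
// ($s0..$s7) and 3 FP registers ($fs0..$fs2) that exceed the caller-saved
// budget; "pop_if_used" with the same operands restores them.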
.altmacro // Enable alternate macro mode
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.endif
.endm // End push_if_used
.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.endif
.endm // End pop_if_used
.macro push_regs from, to
PTR_ST $s\()\from, $sp, \from << REG_LOG
.if \to - \from
push_regs %from + 1, \to
.endif
.endm // End push_regs
.macro pop_regs from, to
PTR_LD $s\()\from, $sp, \from << REG_LOG
.if \to - \from
pop_regs %from + 1, \to
.endif
.endm // End pop_regs
.macro push_fregs from, to
PTR_FST $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
push_fregs %from + 1, \to
.endif
.endm // End push_fregs
.macro pop_fregs from, to
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
pop_fregs %from + 1, \to
.endif
.endm // End pop_fregs
//
// Instruction Related Macros
//
// GLD
//
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ld \out, \src, \offset
.else
\pre_op\()ld.\suf_op \out, \src, \offset
.endif
.ifnb \more
GLD \pre_op, \suf_op, \more
.endif
.endm
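// Example expansion (the macro recurses over the \more vararg tail):
//   GLD xv, , Y0, Y, 0, Y1, Y, 0x20
// becomes
//   xvld Y0, Y, 0
//   xvld Y1, Y, 0x20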
| // | |||
| // GLD_INC | |||
| // | |||
| .macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()ld \out, \src, \offset | |||
| .else | |||
| \pre_op\()ld.\suf_op \out, \src, \offset | |||
| .endif | |||
| PTR_ADDI \src, \src, \inc | |||
| .ifnb \more | |||
| GLD_INC \pre_op, \suf_op, \inc, \more | |||
| .endif | |||
| .endm | |||
| // | |||
// GLDX is the same as GLD, except that the offset is a register
| // | |||
| .macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()ldx \out, \src, \offset | |||
| .else | |||
| \pre_op\()ldx.\suf_op \out, \src, \offset | |||
| .endif | |||
| .ifnb \more | |||
| GLDX \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GLDREPL | |||
| // | |||
| .macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
| \pre_op\()ldrepl.\suf_op \out, \src, \offset | |||
| .ifnb \more | |||
| GLDREPL \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GST | |||
| // | |||
| .macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()st \src, \dst, \offset | |||
| .else | |||
| \pre_op\()st.\suf_op \src, \dst, \offset | |||
| .endif | |||
| .ifnb \more | |||
| GST \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMUL | |||
| // | |||
| .macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()mul.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GMUL \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMADD | |||
| // | |||
| .macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
| \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GMADD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GADD | |||
| // | |||
| .macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()add.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GADD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GADDI | |||
| // | |||
| .macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()addi.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GADDI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSLLI | |||
| // | |||
| .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()slli.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GSLLI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GINSVE0 | |||
| // | |||
| .macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()insve0.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GINSVE0 \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GXOR | |||
| // | |||
| .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()xor.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GXOR \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // Compound instructions | |||
| // | |||
| // GACC: Accumulate the values of vector registers | |||
| // | |||
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "xv"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifeqs "\pre_op", "v"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifnb \more
GACC \pre_op, \suf_op, \more
.endif
.endm