| @@ -11,9 +11,24 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMVNKERNEL = dgemv_n_8_lasx.S | |||
| DGEMVTKERNEL = dgemv_t_8_lasx.S | |||
| SGEMMKERNEL = sgemm_kernel_16x8_lasx.S | |||
| SGEMMINCOPY = sgemm_ncopy_16_lasx.S | |||
| SGEMMITCOPY = sgemm_tcopy_16_lasx.S | |||
| SGEMMONCOPY = sgemm_ncopy_8_lasx.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_lasx.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
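# As in other OpenBLAS ports, two pairs of packing routines are listed above because
# the SGEMM micro-kernel's unroll factors differ (M = 16, N = 8): the SGEMMIN*/SGEMMIT*
# copies pack the 16-wide M dimension, the SGEMMON*/SGEMMOT* copies pack the 8-wide
# N dimension, and the *OBJ variables only name the generated objects
# (e.g. sgemm_incopy$(TSUFFIX).$(SUFFIX)).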
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -36,6 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define PTR_ST st.d | |||
| #define PTR_SLLI slli.d | |||
| #define PTR_SRLI srli.d | |||
| #define PTR_SRAI srai.d | |||
| #define PTR_MUL mul.d | |||
| #define PTR_ALSL alsl.d | |||
| #else | |||
| #define LA_REG int32_t | |||
| @@ -48,6 +50,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define PTR_ST st.w | |||
| #define PTR_SLLI slli.w | |||
| #define PTR_SRLI srli.w | |||
| #define PTR_SRAI srai.w | |||
| #define PTR_MUL mul.w | |||
| #define PTR_ALSL alsl.w | |||
| #endif | |||
| @@ -218,6 +222,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSUB | |||
| // | |||
| .macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()sub.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GSUB \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSLLI | |||
| // | |||
| .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| @@ -244,6 +257,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GXOR \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPERMI | |||
| // | |||
| .macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()permi.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GPERMI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GNMSUB | |||
| // | |||
| .macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
| \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GNMSUB \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPRELD | |||
| // | |||
| .macro GPRELD in0:req, in1:req, in2:req, more:vararg | |||
| preld \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GPRELD \more | |||
| .endif | |||
| .endm | |||
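//
// Note: all of the G* wrappers above recurse over their trailing \more
// arguments, so a single invocation can emit several instructions. An
// illustrative expansion:
//     GPRELD 0, $r12, 0x00, 0, $r12, 0x40
// emits
//     preld   0, $r12, 0x00
//     preld   0, $r12, 0x40
//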
| // | |||
| // Compound instructions | |||
| @@ -311,3 +351,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GACC \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMOV | |||
| // | |||
| .macro GMOV pre_op:req, out:req, in:req, more:vararg | |||
| \pre_op\()or.v \out, \in, \in | |||
| .ifnb \more | |||
| GMOV \pre_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // Media Related Macros | |||
| // | |||
| .macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 | |||
| \pre_op\()ilvl.\suf_op \out0, \in0, \in1 | |||
| \pre_op\()ilvh.\suf_op \out1, \in0, \in1 | |||
| .endm | |||
| .macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 | |||
| \pre_op\()pickev.\suf_op \out0, \in0, \in1 | |||
| \pre_op\()pickod.\suf_op \out1, \in0, \in1 | |||
| .endm | |||
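//
// Illustrative expansion: GSBUTTERFLY xv, w, $xr16, $xr17, $xr2, $xr0 emits
//     xvilvl.w $xr16, $xr2, $xr0
//     xvilvh.w $xr17, $xr2, $xr0
// so out0/out1 receive the low/high interleave of the two inputs; the
// transpose macros below are built from chains of these butterflies.
//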
| // | |||
// GTRANSPOSE4x4_D: transpose a 4x4 block of double-word elements held in vector
// registers. Takes no pre_op parameter; only 256-bit LASX registers are
// supported (128-bit LSX vector instructions are not).
| // | |||
| .macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ | |||
| vt0, vt1 | |||
| GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 | |||
| GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 | |||
| GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 | |||
| GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 | |||
| .endm | |||
| .macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ | |||
| in0, in1, in2, in3, in4, in5, in6, in7, \ | |||
| tmp0, tmp1, tmp2, tmp3 | |||
| GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 | |||
| GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 | |||
| GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 | |||
| GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 | |||
| GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 | |||
| GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 | |||
| GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 | |||
| GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 | |||
| GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 | |||
| GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ | |||
| \out2, \out6, 0x02, \out3, \out7, 0x02, \ | |||
| \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ | |||
| \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 | |||
| .endm | |||
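//
// Typical use (taken from the sgemm copy kernels below), with four scratch
// registers that are kept disjoint from the in/out operands:
//     GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
//                     U0, U1, U2, U3, U4, U5, U6, U7, \
//                     D1, D3, D5, D7 // As tmp
//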
| @@ -0,0 +1,463 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define S9 $r20 | |||
| #define S10 $r23 | |||
| #define S11 $r24 | |||
| #define S12 $r25 | |||
| #define S13 $r26 | |||
| #define S14 $r27 | |||
| #define S15 $r28 | |||
| #define S16 $r29 | |||
| #define TD $r30 | |||
| #define TS $r31 | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define U8 $xr8 | |||
| #define U9 $xr9 | |||
| #define U10 $xr10 | |||
| #define U11 $xr11 | |||
| #define U12 $xr12 | |||
| #define U13 $xr13 | |||
| #define U14 $xr14 | |||
| #define U15 $xr15 | |||
| #define D0 $xr16 | |||
| #define D1 $xr17 | |||
| #define D2 $xr18 | |||
| #define D3 $xr19 | |||
| #define D4 $xr20 | |||
| #define D5 $xr21 | |||
| #define D6 $xr22 | |||
| #define D7 $xr23 | |||
| #define D8 $xr24 | |||
| #define D9 $xr25 | |||
| #define D10 $xr26 | |||
| #define D11 $xr27 | |||
| #define D12 $xr28 | |||
| #define D13 $xr29 | |||
| #define D14 $xr30 | |||
| #define D15 $xr31 | |||
| // Loops outline | |||
| //.L_N16 <------------------- | |||
| //| .L_M8: | | |||
| //| .L_M7: | Main Loop | |||
| //| .L_M1: | | |||
| //| .L_M0: --------------- | |||
| //.L_N15: | |||
| //.L_N8: | |||
| //| .L_N8_M8: | |||
| //| .L_N8_M7: | |||
| //| .L_N8_M1: | |||
| //.L_N7: | |||
| //.L_N4: | |||
| //| .L_N4_M4: | |||
| //| .L_N4_M3: | |||
| //| .L_N4_M1: | |||
| //.L_N3: | |||
| //.L_N2: | |||
| //| .L_N2_M2: | |||
| //| .L_N2_M1: | |||
| //.L_N1: | |||
| //| .L_N1_M1: | |||
| //.L_N0 | |||
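//
// Illustrative C equivalent of one .L_M8 iteration below (assuming src points
// at the current 8x16 tile of a column-major matrix with leading dimension
// lda, counted in elements):
//     for (i = 0; i < 8; i++)        // rows within the tile
//         for (j = 0; j < 16; j++)   // the 16 columns S1..S16
//             *dst++ = src[i + j * lda];
// The two GTRANSPOSE8x8_W calls gather each row's 16 column elements into
// contiguous vectors before storing them.
//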
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| move TD, DST | |||
| move TS, SRC | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SRAI J, N, 0x04 | |||
| beq J, ZERO, .L_N15 | |||
| .align 5 | |||
| .L_N16: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADDI J, J, -1 | |||
| PTR_ADD S4, S3, TL | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S9, S7, T0 | |||
| PTR_ADD S10, S8, T0 | |||
| PTR_ADD S11, S9, T0 | |||
| PTR_ADD S12, S10, T0 | |||
| PTR_ADD S13, S11, T0 | |||
| PTR_ADD S14, S12, T0 | |||
| PTR_ADD S15, S13, T0 | |||
| PTR_ADD S16, S14, T0 | |||
| PTR_ADD TS, S15, T0 | |||
| beq I, ZERO, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvld U8, S9, 0x00 | |||
| xvld U9, S10, 0x00 | |||
| xvld U10, S11, 0x00 | |||
| xvld U11, S12, 0x00 | |||
| xvld U12, S13, 0x00 | |||
| xvld U13, S14, 0x00 | |||
| xvld U14, S15, 0x00 | |||
| xvld U15, S16, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ | |||
| U8, U9, U10, U11, U12, U13, U14, U15, \ | |||
| U0, U1, U2, U3 // As tmp | |||
| GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ | |||
| D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ | |||
| D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI S9, S9, 0x20 | |||
| PTR_ADDI S10, S10, 0x20 | |||
| PTR_ADDI S11, S11, 0x20 | |||
| PTR_ADDI S12, S12, 0x20 | |||
| PTR_ADDI S13, S13, 0x20 | |||
| PTR_ADDI S14, S14, 0x20 | |||
| PTR_ADDI S15, S15, 0x20 | |||
| PTR_ADDI S16, S16, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M8 | |||
| .L_M7: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_M0 | |||
| .align 5 | |||
| .L_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0C | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| fld.s F0, S9, 0x00 | |||
| fld.s F1, S10, 0x00 | |||
| fld.s F2, S11, 0x00 | |||
| fld.s F3, S12, 0x00 | |||
| fld.s F4, S13, 0x00 | |||
| fld.s F5, S14, 0x00 | |||
| fld.s F6, S15, 0x00 | |||
| fld.s F7, S16, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0C | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S9, S9, 0x04 | |||
| PTR_ADDI S10, S10, 0x04 | |||
| PTR_ADDI S11, S11, 0x04 | |||
| PTR_ADDI S12, S12, 0x04 | |||
| PTR_ADDI S13, S13, 0x04 | |||
| PTR_ADDI S14, S14, 0x04 | |||
| PTR_ADDI S15, S15, 0x04 | |||
| PTR_ADDI S16, S16, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M1 | |||
| .L_M0: | |||
| blt ZERO, J, .L_N16 | |||
| .L_N15: | |||
| andi J, N, 0x0f | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x08 | |||
| beq ZERO, J, .L_N7 | |||
| .L_N8: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD TS, S7, T0 | |||
| beq I, ZERO, .L_N8_M7 | |||
| .align 5 | |||
| .L_N8_M8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
| D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N8_M8 | |||
| .L_N8_M7: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_N7 | |||
| .align 5 | |||
| .L_N8_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| fst.s F4, TD, 0x10 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| fst.s F5, TD, 0x14 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| fst.s F6, TD, 0x18 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N8_M1 | |||
| .L_N7: | |||
| andi J, N, 0x07 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N3 | |||
| .L_N4: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x02 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD TS, S3, T0 | |||
| beq I, ZERO, .L_N4_M3 | |||
| .align 5 | |||
| .L_N4_M4: | |||
| GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
| GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
| GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
| GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
| GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
| GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI TD, TD, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M4 | |||
| .L_N4_M3: | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N3 | |||
| .align 5 | |||
| .L_N4_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M1 | |||
| .L_N3: | |||
| andi J, N, 0x03 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| .L_N2: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x01 | |||
| PTR_ADD TS, S2, TL | |||
| beq I, ZERO, .L_N2_M1 | |||
| .align 5 | |||
| .L_N2_M2: | |||
| GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
| vilvl.w $vr0, $vr1, $vr0 | |||
| GST v, , $vr0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N2_M2 | |||
| .L_N2_M1: | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI TD, TD, 0x08 | |||
| .align 5 | |||
| .L_N1: | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_N1_M1: | |||
| fld.s F0, S1, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI TD, TD, 0x04 | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,298 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| #define D8 $xr16 | |||
| #define D10 $xr17 | |||
| #define D12 $xr18 | |||
| #define D14 $xr19 | |||
| // Loops outline | |||
| //.L_N8: <---------------- | |||
| //| .L_M8: | | |||
| //| .L_M7: | Main Loop | |||
| //| .L_M1: | | |||
| //| .L_M0:-------------- | |||
| //.L_N7: | |||
| //.L_N4: | |||
| //| .L_N4_M4: | |||
| //| .L_N4_M3: | |||
| //| .L_N4_M1: | |||
| //.L_N3: | |||
| //.L_N2: | |||
| //| .L_N2_M2: | |||
| //| .L_N2_M1: | |||
| //.L_N1: | |||
| //| .L_N1_M1: | |||
| //.L_N0 | |||
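//
// Same packing scheme as the 16-wide routine above, but each output row
// receives 8 contiguous column elements, so one GTRANSPOSE8x8_W per 8x8 tile
// suffices.
//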
| PROLOGUE | |||
| push_if_used 17, 20 | |||
| move TD, DST | |||
| move TS, SRC | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SRAI J, N, 0x03 | |||
| beq J, ZERO, .L_N7 | |||
| .align 5 | |||
| .L_N8: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADDI J, J, -1 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD TS, S7, T0 | |||
| beq I, ZERO, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
| D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M8 | |||
| .L_M7: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_M0 | |||
| .align 5 | |||
| .L_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| fst.s F4, TD, 0x10 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| fst.s F5, TD, 0x14 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| fst.s F6, TD, 0x18 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M1 | |||
| .L_M0: | |||
| blt ZERO, J, .L_N8 | |||
| .L_N7: | |||
| andi J, N, 0x07 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N3 | |||
| .L_N4: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x02 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD TS, S3, T0 | |||
| beq I, ZERO, .L_N4_M3 | |||
| .align 5 | |||
| .L_N4_M4: | |||
| GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
| GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
| GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
| GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
| GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
| GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI TD, TD, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M4 | |||
| .L_N4_M3: | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N3 | |||
| .align 5 | |||
| .L_N4_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M1 | |||
| .L_N3: | |||
| andi J, N, 0x03 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| .L_N2: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x01 | |||
| PTR_ADD TS, S2, TL | |||
| beq I, ZERO, .L_N2_M1 | |||
| .align 5 | |||
| .L_N2_M2: | |||
| GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
| vilvl.w $vr0, $vr1, $vr0 | |||
| GST v, , $vr0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N2_M2 | |||
| .L_N2_M1: | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI TD, TD, 0x08 | |||
| .align 5 | |||
| .L_N1: | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_N1_M1: | |||
| fld.s F0, S1, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI TD, TD, 0x04 | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| pop_if_used 17, 20 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,526 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define P0 $r20 | |||
| #define P1 $r23 | |||
| #define P2 $r24 | |||
| #define P3 $r25 | |||
| #define P4 $r26 | |||
| #define P5 $r27 | |||
| #define T0 $r28 | |||
| #define T1 $r29 | |||
| #define TL $r7 | |||
| #define ZERO $r0 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| // Loops outline | |||
| //.L_M8 <------------------- | |||
| //| .L_N16: | | |||
| //| .L_N15: | | |||
| //| .L_N8: | | |||
| //| .L_N7: | Main Loop | |||
| //| .L_N4: | | |||
| //| .L_N3: | | |||
| //| .L_N2: | | |||
| //| .L_N1: | | |||
| //| .L_N0: --------------- | |||
| //.L_M7 | |||
| //.L_M4 | |||
| //| .L_M4_N16: | |||
| //| .L_M4_N15: | |||
| //| .L_M4_N8: | |||
| //| .L_M4_N7: | |||
| //| .L_M4_N4: | |||
| //| .L_M4_N3: | |||
| //| .L_M4_N2: | |||
| //| .L_M4_N1: | |||
| //.L_M3 | |||
| //.L_M2 | |||
| //| .L_M2_N16: | |||
| //| .L_M2_N15: | |||
| //| .L_M2_N8: | |||
| //| .L_M2_N7: | |||
| //| .L_M2_N4: | |||
| //| .L_M2_N3: | |||
| //| .L_M2_N2: | |||
| //| .L_M2_N1: | |||
| //.L_M1 | |||
| //| .L_M1_N16: | |||
| //| .L_M1_N15: | |||
| //| .L_M1_N8: | |||
| //| .L_M1_N7: | |||
| //| .L_M1_N4: | |||
| //| .L_M1_N3: | |||
| //| .L_M1_N2: | |||
| //| .L_M1_N1: | |||
| //.L_M0 | |||
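//
// The prologue below pre-partitions DST: full 16-wide panels are written at
// DST itself (via P0/P1), while the 8-, 4-, 2- and 1-wide column tails go to
// P2..P5, placed after all wider panels (byte offsets, illustrative):
//     P2 = DST + 4*M*(N & ~15)     // 8-wide tail
//     P3 = DST + 4*M*(N & ~7)      // 4-wide tail
//     P4 = DST + 4*M*(N & ~3)      // 2-wide tail
//     P5 = DST + 4*M*(N & ~1)      // 1-wide tail
// e.g. for N = 29 (= 16+8+4+1) these are element offsets 16*M, 24*M, 28*M
// and 28*M respectively.
//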
| PROLOGUE | |||
| push_if_used 24, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| PTR_SRAI T0, N, 0x04 | |||
| PTR_SRAI T1, N, 0x03 | |||
| PTR_SLLI T0, T0, 0x04 | |||
| PTR_SLLI T1, T1, 0x03 | |||
| PTR_MUL P2, M, T0 | |||
| PTR_MUL P3, M, T1 | |||
| PTR_SLLI P2, P2, 0x02 | |||
| PTR_SLLI P3, P3, 0x02 | |||
| PTR_ADD P2, DST, P2 | |||
| PTR_ADD P3, DST, P3 | |||
| PTR_SRAI T0, N, 0x02 | |||
| PTR_SRAI T1, N, 0x01 | |||
| PTR_SLLI T0, T0, 0x02 | |||
| PTR_SLLI T1, T1, 0x01 | |||
| PTR_MUL P4, M, T0 | |||
| PTR_MUL P5, M, T1 | |||
| PTR_SLLI P4, P4, 0x02 | |||
| PTR_SLLI P5, P5, 0x02 | |||
| PTR_ADD P4, DST, P4 | |||
| PTR_ADD P5, DST, P5 | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SRAI J, M, 0x03 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SLLI T1, M, 0x06 | |||
| beq ZERO, J, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S0, S7, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x200 | |||
| PTR_SRAI I, N, 0x04 | |||
| PTR_ADDI J, J, -1 | |||
| beq ZERO, I, .L_N15 | |||
| .L_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| xvld U0, S5, 0x00 | |||
| xvld U1, S5, 0x20 | |||
| xvld U2, S6, 0x00 | |||
| xvld U3, S6, 0x20 | |||
| xvst U0, P1, 0x100 | |||
| xvst U1, P1, 0x120 | |||
| xvst U2, P1, 0x140 | |||
| xvst U3, P1, 0x160 | |||
| xvld U4, S7, 0x00 | |||
| xvld U5, S7, 0x20 | |||
| xvld U6, S8, 0x00 | |||
| xvld U7, S8, 0x20 | |||
| xvst U4, P1, 0x180 | |||
| xvst U5, P1, 0x1A0 | |||
| xvst U6, P1, 0x1C0 | |||
| xvst U7, P1, 0x1E0 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI S3, S3, 0x40 | |||
| PTR_ADDI S4, S4, 0x40 | |||
| PTR_ADDI S5, S5, 0x40 | |||
| PTR_ADDI S6, S6, 0x40 | |||
| PTR_ADDI S7, S7, 0x40 | |||
| PTR_ADDI S8, S8, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_N16 | |||
| .L_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_N7 | |||
| .L_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ | |||
| U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI P2, P2, 0x100 | |||
| .L_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| .L_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
| $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ | |||
| $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI S5, S5, 0x10 | |||
| PTR_ADDI S6, S6, 0x10 | |||
| PTR_ADDI S7, S7, 0x10 | |||
| PTR_ADDI S8, S8, 0x10 | |||
| PTR_ADDI P3, P3, 0x80 | |||
| .L_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| .L_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ | |||
| $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI S5, S5, 0x08 | |||
| PTR_ADDI S6, S6, 0x08 | |||
| PTR_ADDI S7, S7, 0x08 | |||
| PTR_ADDI S8, S8, 0x08 | |||
| PTR_ADDI P4, P4, 0x40 | |||
| .L_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ | |||
| $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI P5, P5, 0x20 | |||
| .L_N0: | |||
| blt ZERO, J, .L_M8 | |||
| .L_M7: | |||
| andi J, M, 0x04 | |||
| beq ZERO, J, .L_M3 | |||
| .L_M4: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S0, S3, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x100 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N15 | |||
| .align 5 | |||
| .L_M4_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI S3, S3, 0x40 | |||
| PTR_ADDI S4, S4, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M4_N16 | |||
| .L_M4_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M4_N7 | |||
| .L_M4_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI P2, P2, 0x80 | |||
| .L_M4_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N3 | |||
| .L_M4_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI P3, P3, 0x40 | |||
| .L_M4_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M4_N1 | |||
| .L_M4_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI P4, P4, 0x20 | |||
| .L_M4_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI P5, P5, 0x10 | |||
| .L_M3: | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| .L_M2: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S0, S0, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x80 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N15 | |||
| .align 5 | |||
| .L_M2_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M2_N16 | |||
| .L_M2_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M2_N7 | |||
| .L_M2_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI P2, P2, 0x40 | |||
| .L_M2_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N3 | |||
| .L_M2_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI P3, P3, 0x20 | |||
| .L_M2_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2_N1 | |||
| .L_M2_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI P4, P4, 0x10 | |||
| .L_M2_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI P5, P5, 0x08 | |||
| .L_M1: | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x40 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N15 | |||
| .align 5 | |||
| .L_M1_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M1_N16 | |||
| .L_M1_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M1_N7 | |||
| .L_M1_N8: | |||
| xvld U0, S1, 0x00 | |||
| GST xv, , U0, P2, 0x00 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI P2, P2, 0x20 | |||
| .L_M1_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N3 | |||
| .L_M1_N4: | |||
| GLD v, , $vr0, S1, 0x00 | |||
| GST v, , $vr0, P3, 0x00 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI P3, P3, 0x10 | |||
| .L_M1_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1_N1 | |||
| .L_M1_N2: | |||
| GLD f, d, $f0, S1, 0x00 | |||
| GST f, d, $f0, P4, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI P4, P4, 0x08 | |||
| .L_M1_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| GLD f, s, $f0, S1, 0x00 | |||
| GST f, s, $f0, P5, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P5, P5, 0x04 | |||
| .L_M0: | |||
| pop_if_used 24, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,406 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define P0 $r20 | |||
| #define P1 $r23 | |||
| #define P2 $r24 | |||
| #define P3 $r25 | |||
| #define P4 $r26 | |||
| #define T0 $r27 | |||
| #define T1 $r28 | |||
| #define TL $r7 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| // Loops outline | |||
| //.L_M8 <------------------- | |||
| //| .L_N8: | | |||
| //| .L_N7: | Main Loop | |||
| //| .L_N4: | | |||
| //| .L_N3: | | |||
| //| .L_N2: | | |||
| //| .L_N1: | | |||
| //| .L_N0: --------------- | |||
| //.L_M7 | |||
| //.L_M4 | |||
| //| .L_M4_N8: | |||
| //| .L_M4_N7: | |||
| //| .L_M4_N4: | |||
| //| .L_M4_N3: | |||
| //| .L_M4_N2: | |||
| //| .L_M4_N1: | |||
| //.L_M3 | |||
| //.L_M2 | |||
| //| .L_M2_N8: | |||
| //| .L_M2_N7: | |||
| //| .L_M2_N4: | |||
| //| .L_M2_N3: | |||
| //| .L_M2_N2: | |||
| //| .L_M2_N1: | |||
| //.L_M1 | |||
| //| .L_M1_N8: | |||
| //| .L_M1_N7: | |||
| //| .L_M1_N4: | |||
| //| .L_M1_N3: | |||
| //| .L_M1_N2: | |||
| //| .L_M1_N1: | |||
| //.L_M0 | |||
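//
// 8-wide analogue of the previous routine: full 8-wide panels are written at
// DST (via P0/P1), and the 4-, 2- and 1-wide column tails are placed after
// them at (byte offsets, illustrative):
//     P2 = DST + 4*M*(N & ~7)      // 4-wide tail
//     P3 = DST + 4*M*(N & ~3)      // 2-wide tail
//     P4 = DST + 4*M*(N & ~1)      // 1-wide tail
//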
| PROLOGUE | |||
| push_if_used 23, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| PTR_SRAI T0, N, 0x04 | |||
| PTR_SRAI T1, N, 0x03 | |||
| PTR_SLLI T0, T0, 0x04 | |||
| PTR_SLLI T1, T1, 0x03 | |||
| PTR_MUL P2, M, T1 | |||
| PTR_SLLI P2, P2, 0x02 | |||
| PTR_ADD P2, DST, P2 | |||
| PTR_SRAI T0, N, 0x02 | |||
| PTR_SRAI T1, N, 0x01 | |||
| PTR_SLLI T0, T0, 0x02 | |||
| PTR_SLLI T1, T1, 0x01 | |||
| PTR_MUL P3, M, T0 | |||
| PTR_MUL P4, M, T1 | |||
| PTR_SLLI P3, P3, 0x02 | |||
| PTR_SLLI P4, P4, 0x02 | |||
| PTR_ADD P3, DST, P3 | |||
| PTR_ADD P4, DST, P4 | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SRAI J, M, 0x03 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SLLI T1, M, 0x05 | |||
| beq ZERO, J, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S0, S7, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x100 | |||
| PTR_SRAI I, N, 0x03 | |||
| PTR_ADDI J, J, -1 | |||
| beq ZERO, I, .L_N7 | |||
| .L_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ | |||
| U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_N8 | |||
| .L_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| .L_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
| $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ | |||
| $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI S5, S5, 0x10 | |||
| PTR_ADDI S6, S6, 0x10 | |||
| PTR_ADDI S7, S7, 0x10 | |||
| PTR_ADDI S8, S8, 0x10 | |||
| PTR_ADDI P2, P2, 0x80 | |||
| .L_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| .L_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ | |||
| $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI S5, S5, 0x08 | |||
| PTR_ADDI S6, S6, 0x08 | |||
| PTR_ADDI S7, S7, 0x08 | |||
| PTR_ADDI S8, S8, 0x08 | |||
| PTR_ADDI P3, P3, 0x40 | |||
| .L_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ | |||
| $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI P4, P4, 0x20 | |||
| .L_N0: | |||
| blt ZERO, J, .L_M8 | |||
| .L_M7: | |||
| andi J, M, 0x04 | |||
| beq ZERO, J, .L_M3 | |||
| .L_M4: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S0, S3, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x80 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M4_N7 | |||
| .align 5 | |||
| .L_M4_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M4_N8 | |||
| .L_M4_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N3 | |||
| .L_M4_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI P2, P2, 0x40 | |||
| .L_M4_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M4_N1 | |||
| .L_M4_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI P3, P3, 0x20 | |||
| .L_M4_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI P4, P4, 0x10 | |||
| .L_M3: | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| .L_M2: | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S0, S0, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x40 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M2_N7 | |||
| .align 5 | |||
| .L_M2_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M2_N8 | |||
| .L_M2_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N3 | |||
| .L_M2_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI P2, P2, 0x20 | |||
| .L_M2_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2_N1 | |||
| .L_M2_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI P3, P3, 0x10 | |||
| .L_M2_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI P4, P4, 0x08 | |||
| .L_M1: | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x20 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M1_N7 | |||
| .align 5 | |||
| .L_M1_N8: | |||
| xvld U0, S1, 0x00 | |||
| GST xv, , U0, P1, 0x00 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M1_N8 | |||
| .L_M1_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N3 | |||
| .L_M1_N4: | |||
| GLD v, , $vr0, S1, 0x00 | |||
| GST v, , $vr0, P2, 0x00 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI P2, P2, 0x10 | |||
| .L_M1_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1_N1 | |||
| .L_M1_N2: | |||
| GLD f, d, $f0, S1, 0x00 | |||
| GST f, d, $f0, P3, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI P3, P3, 0x08 | |||
| .L_M1_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| GLD f, s, $f0, S1, 0x00 | |||
| GST f, s, $f0, P4, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P4, P4, 0x04 | |||
| .L_M0: | |||
| pop_if_used 23, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -2848,34 +2848,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NO_LASX) | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #else | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #endif | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #define SGEMM_DEFAULT_UNROLL_M 2 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 1 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define SGEMM_DEFAULT_P 256 | |||
| #define DGEMM_DEFAULT_P 32 | |||
| #define CGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_R 12288 | |||
| #define SGEMM_DEFAULT_R 1024 | |||
| #define DGEMM_DEFAULT_R 858 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #define SGEMM_DEFAULT_Q 128 | |||
| #define SGEMM_DEFAULT_Q 256 | |||
| #define DGEMM_DEFAULT_Q 152 | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 128 | |||
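/*
 * Illustrative summary of the values above: UNROLL_M/UNROLL_N are the
 * register-tile dimensions of the micro-kernel (16x8 for the LASX SGEMM
 * kernel added here), while the P, Q and R defaults block the M, K and N
 * loop dimensions of the GEMM driver; taking P = Q = 256, one packed block
 * of A holds 256*256 singles = 256 KiB.
 */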