Signed-off-by: Hao Chen <chenhao@loongson.cn>
tags/v0.3.26
| @@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S | |||
| DNRM2KERNEL = dnrm2_lsx.S | |||
| DGEMMKERNEL = dgemm_kernel_8x4.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPY = dgemm_ncopy_8_lsx.S | |||
| DGEMMITCOPY = dgemm_tcopy_8_lsx.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_lsx.S | |||
| DGEMMOTCOPY = dgemm_tcopy_4_lsx.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,185 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Function parameters: symbolic names for the five integer argument registers */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 /* inner loop counter */ | |||
| #define J $r10 /* outer loop counter */ | |||
| #define S1 $r12 /* S1-S4: source line pointers of the group being packed */ | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 /* S5-S8 are defined but not referenced by this kernel */ | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 /* running destination pointer, initialised from DST */ | |||
| #define TS $r21 /* start of the next unprocessed source line, initialised from SRC */ | |||
| #define TL $r7 /* same register as LDA: overwritten with LDA << 3 */ | |||
| #define T0 $r6 /* same register as SRC: overwritten with TL << 1 once SRC is copied to TS */ | |||
| #define ZERO $r0 /* used as the constant 0 in branch comparisons */ | |||
| #define F0 $f0 /* F0-F3: scalar temporaries of the tail loops */ | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 /* F4-F7 are defined but not referenced by this kernel */ | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 /* U0-U3: vectors loaded from the source lines */ | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 /* U4-U7 are defined but not referenced by this kernel */ | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define D0 $vr8 /* D0-D3: GINTERLACE results that are stored to TD */ | |||
| #define D1 $vr9 | |||
| #define D2 $vr10 | |||
| #define D3 $vr11 | |||
| #define D4 $vr12 /* D4-D7 are defined but not referenced by this kernel */ | |||
| #define D5 $vr13 | |||
| #define D6 $vr14 | |||
| #define D7 $vr15 | |||
| /* | |||
| * Double-precision GEMM "ncopy" packing kernel, 4-wide, LSX version. | |||
| * Arguments: M=$r4, N=$r5, SRC=$r6, LDA=$r7 (in doubles), DST=$r8. | |||
| * SRC is read as N lines spaced LDA doubles apart, M doubles per line. | |||
| * Lines are consumed four at a time, then two (N & 2), then one (N & 1). | |||
| * Inside a group the i-th doubles of all lines of the group are written to | |||
| * DST next to each other; DST is filled strictly sequentially through TD. | |||
| * GLD / GINTERLACE / GST come from loongarch64_asm.S; they are presumed to | |||
| * expand to vld, vilvl.d + vilvh.d and vst -- confirm against that header. | |||
| * M is destroyed by the final single-line loop. | |||
| */ | |||
| PROLOGUE | |||
| move TD, DST | |||
| move TS, SRC /* must happen before T0 is written: T0 shares $r6 with SRC */ | |||
| slli.d TL, LDA, 0x03 /* TL = LDA * 8, the line stride in bytes */ | |||
| slli.d T0, TL, 0x01 /* T0 = 2 * TL */ | |||
| srai.d J, N, 0x02 /* J = N >> 2 groups of four lines */ | |||
| beq J, ZERO, .L_N2 | |||
| .L_J1: /* J-- */ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 /* I = M >> 2 unrolled iterations */ | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d TS, S3, T0 /* TS = first line of the next group */ | |||
| addi.d J, J, -1 | |||
| beq I, ZERO, .L_I3 | |||
| .L_I1: /* I-- */ | |||
| /* doubles 0-1 of each of the four lines -> 8 packed doubles */ | |||
| GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 | |||
| GINTERLACE v, d, D0, D2, U1, U0 | |||
| GINTERLACE v, d, D1, D3, U3, U2 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 | |||
| addi.d TD, TD, 0x40 | |||
| /* doubles 2-3 of each of the four lines -> next 8 packed doubles */ | |||
| GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 | |||
| GINTERLACE v, d, D0, D2, U1, U0 | |||
| GINTERLACE v, d, D1, D3, U3, U2 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I3: | |||
| andi I, M, 0x03 /* leftover doubles per line (0-3) */ | |||
| beq I, ZERO, .L_I0 | |||
| .L_II1: | |||
| /* one double from each of the four lines per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_II1 | |||
| .L_I0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_N2: | |||
| /* one group of two lines, present when bit 1 of N is set */ | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 /* I = M >> 1 */ | |||
| add.d TS, S2, TL | |||
| beq I, ZERO, .L_2I3 | |||
| .L_2I1: /* I-- */ | |||
| GLD v, , U0, S1, 0x00, U1, S2, 0x00 | |||
| GINTERLACE v, d, D0, D1, U1, U0 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_2I3: | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N1 | |||
| .L_2II1: /* I-- */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d I, I, -1 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| blt ZERO, I, .L_2II1 | |||
| .L_N1: | |||
| /* last single line, present only when N is odd; without this test an even N | |||
| would copy M doubles from beyond line N-1 and store them past the packed data */ | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_M1: | |||
| /* plain copy of the remaining line, one double per iteration; M counts down */ | |||
| fld.d F0, S1, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d M, M, -1 | |||
| blt ZERO, M, .L_M1 | |||
| .L_N0: | |||
| jirl $r0, $r1, 0x00 /* return to the address in $r1 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,283 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Function parameters: symbolic names for the five integer argument registers */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 /* inner loop counter */ | |||
| #define J $r10 /* outer loop counter */ | |||
| #define S1 $r12 /* S1-S8: source line pointers of the group being packed */ | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 /* running destination pointer, initialised from DST */ | |||
| #define TS $r21 /* start of the next unprocessed source line, initialised from SRC */ | |||
| #define TL $r7 /* same register as LDA: overwritten with LDA << 3 */ | |||
| #define T0 $r6 /* same register as SRC: overwritten with TL << 1 once SRC is copied to TS */ | |||
| #define ZERO $r0 /* used as the constant 0 in branch comparisons */ | |||
| #define F0 $f0 /* F0-F7: scalar temporaries of the tail loops */ | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 /* U0-U7: vectors loaded from the source lines */ | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define D0 $vr8 /* D0-D7: GINTERLACE results that are stored to TD */ | |||
| #define D1 $vr9 | |||
| #define D2 $vr10 | |||
| #define D3 $vr11 | |||
| #define D4 $vr12 | |||
| #define D5 $vr13 | |||
| #define D6 $vr14 | |||
| #define D7 $vr15 | |||
| /* | |||
| * Double-precision GEMM "ncopy" packing kernel, 8-wide, LSX version. | |||
| * Arguments: M=$r4, N=$r5, SRC=$r6, LDA=$r7 (in doubles), DST=$r8. | |||
| * SRC is read as N lines spaced LDA doubles apart, M doubles per line. | |||
| * Lines are consumed eight at a time, then four (N & 4), two (N & 2), one (N & 1). | |||
| * Inside a group the i-th doubles of all lines of the group are written to | |||
| * DST next to each other; DST is filled strictly sequentially through TD. | |||
| * GLD / GINTERLACE / GST and push_if_used / pop_if_used come from | |||
| * loongarch64_asm.S; the former are presumed to expand to vld, vilvl.d + vilvh.d | |||
| * and vst, the latter to save/restore callee-saved registers -- confirm there. | |||
| * M is destroyed by the final single-line loop. | |||
| */ | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| move TD, DST | |||
| move TS, SRC /* must happen before T0 is written: T0 shares $r6 with SRC */ | |||
| slli.d TL, LDA, 0x03 /* TL = LDA * 8, the line stride in bytes */ | |||
| slli.d T0, TL, 0x01 /* T0 = 2 * TL */ | |||
| srai.d J, N, 0x03 /* J = N >> 3 groups of eight lines */ | |||
| beq J, ZERO, .L_N4 | |||
| .L_J1: | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x03 /* I = M >> 3 unrolled iterations */ | |||
| add.d S3, S2, TL | |||
| addi.d J, J, -1 | |||
| add.d S4, S3, TL | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d TS, S7, T0 /* TS = first line of the next group */ | |||
| beq I, ZERO, .L_I7 | |||
| .L_I1: | |||
| /* four passes; each takes two doubles from each of the eight lines (16 doubles out) */ | |||
| GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ | |||
| U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 | |||
| GINTERLACE v, d, D0, D4, U1, U0 | |||
| GINTERLACE v, d, D1, D5, U3, U2 | |||
| GINTERLACE v, d, D2, D6, U5, U4 | |||
| GINTERLACE v, d, D3, D7, U7, U6 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ | |||
| D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 | |||
| addi.d TD, TD, 0x80 | |||
| GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ | |||
| U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 | |||
| GINTERLACE v, d, D0, D4, U1, U0 | |||
| GINTERLACE v, d, D1, D5, U3, U2 | |||
| GINTERLACE v, d, D2, D6, U5, U4 | |||
| GINTERLACE v, d, D3, D7, U7, U6 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ | |||
| D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 | |||
| addi.d TD, TD, 0x80 | |||
| GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ | |||
| U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 | |||
| GINTERLACE v, d, D0, D4, U1, U0 | |||
| GINTERLACE v, d, D1, D5, U3, U2 | |||
| GINTERLACE v, d, D2, D6, U5, U4 | |||
| GINTERLACE v, d, D3, D7, U7, U6 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ | |||
| D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 | |||
| addi.d TD, TD, 0x80 | |||
| GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ | |||
| U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 | |||
| GINTERLACE v, d, D0, D4, U1, U0 | |||
| GINTERLACE v, d, D1, D5, U3, U2 | |||
| GINTERLACE v, d, D2, D6, U5, U4 | |||
| GINTERLACE v, d, D3, D7, U7, U6 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ | |||
| D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I7: | |||
| andi I, M, 0x07 /* leftover doubles per line (0-7) */ | |||
| beq I, ZERO, .L_I0 | |||
| .L_II1: /* I-- */ | |||
| /* one double from each of the eight lines per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S5, S5, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S6, S6, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S7, S7, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_II1 | |||
| .L_I0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_N4: | |||
| /* one group of four lines, present when bit 2 of N is set */ | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N2 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 /* I = M >> 2 */ | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d TS, S3, T0 | |||
| beq I, ZERO, .L_I3 | |||
| .L_4I1: /* I-- */ | |||
| GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 | |||
| GINTERLACE v, d, D0, D2, U1, U0 | |||
| GINTERLACE v, d, D1, D3, U3, U2 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 | |||
| addi.d TD, TD, 0x40 | |||
| GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 | |||
| GINTERLACE v, d, D0, D2, U1, U0 | |||
| GINTERLACE v, d, D1, D3, U3, U2 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4I1 | |||
| .L_I3: | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N2 | |||
| .L_4II1: | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4II1 | |||
| .L_N2: | |||
| /* one group of two lines, present when bit 1 of N is set */ | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 /* I = M >> 1 */ | |||
| add.d TS, S2, TL | |||
| beq I, ZERO, .L_NI1 | |||
| .L_2I1: /* I-- */ | |||
| GLD v, , U0, S1, 0x00, U1, S2, 0x00 | |||
| GINTERLACE v, d, D0, D1, U1, U0 | |||
| GST v, , D0, TD, 0x00, D1, TD, 0x10 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_NI1: | |||
| andi I, M, 0x01 /* at most one leftover double per line, so no loop */ | |||
| beq I, ZERO, .L_N1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| .L_N1: | |||
| /* last single line, present only when N is odd; without this test an even N | |||
| would copy M doubles from beyond line N-1 and store them past the packed data */ | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_M1: | |||
| /* plain copy of the remaining line, one double per iteration; M counts down */ | |||
| fld.d F0, S1, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d M, M, -1 | |||
| blt ZERO, M, .L_M1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| jirl $r0, $r1, 0x00 /* return to the address in $r1 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,280 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Function parameters: symbolic names for the five integer argument registers */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 /* inner loop counter */ | |||
| #define J $r10 /* outer loop counter */ | |||
| #define S0 $r11 /* start of the next unprocessed source line, initialised from SRC */ | |||
| #define S1 $r12 /* S1-S4: source line pointers of the group being packed */ | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define P0 $r16 /* start of the next 4-wide output panel, initialised from DST */ | |||
| #define P1 $r17 /* write cursor inside the 4-wide region */ | |||
| #define P2 $r18 /* write cursor of the 2-wide region */ | |||
| #define P3 $r19 /* write cursor of the 1-wide region */ | |||
| #define T0 $r20 /* scratch, finally 2 * TL */ | |||
| #define T1 $r23 /* scratch, finally M << 5 */ | |||
| #define TL $r7 /* same register as LDA: overwritten with LDA << 3 */ | |||
| #define ZERO $r0 /* used as the constant 0 in branch comparisons */ | |||
| #define F0 $f0 /* F0-F3: scalar temporaries of the single-double tails */ | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 /* U0-U7: vectors copied unchanged from source to destination */ | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| /* | |||
| * Double-precision GEMM "tcopy" packing kernel, 4-wide, LSX version. | |||
| * Arguments: M=$r4, N=$r5, SRC=$r6, LDA=$r7 (in doubles), DST=$r8. | |||
| * SRC is read as M lines spaced LDA doubles apart, N doubles per line. | |||
| * Lines are consumed four at a time, then two (M & 2), then one (M & 1). | |||
| * DST is written through three cursors: | |||
| * P0/P1 from DST : chunks of 4 doubles per line (N >> 2 of them) | |||
| * P2 from DST + M * (N & ~3) * 8 : chunk of 2 doubles per line (N & 2) | |||
| * P3 from DST + M * (N & ~1) * 8 : last single double per line (N & 1) | |||
| * push_if_used / pop_if_used come from loongarch64_asm.S and are presumed to | |||
| * save/restore the callee-saved register used here (T1 = $r23) -- confirm there. | |||
| */ | |||
| PROLOGUE | |||
| push_if_used 18, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| /* compute the start of the 2-wide (P2) and 1-wide (P3) output regions */ | |||
| srai.d T0, N, 0x02 | |||
| slli.d T0, T0, 0x02 /* T0 = N & ~3 */ | |||
| srai.d T1, N, 0x01 | |||
| slli.d T1, T1, 0x01 /* T1 = N & ~1 */ | |||
| mul.d T0, M, T0 | |||
| mul.d T1, M, T1 | |||
| slli.d T0, T0, 0x03 /* element counts -> byte offsets */ | |||
| slli.d T1, T1, 0x03 | |||
| add.d P2, DST, T0 | |||
| add.d P3, DST, T1 | |||
| slli.d TL, LDA, 0x03 /* TL = LDA * 8, the line stride in bytes */ | |||
| srai.d J, M, 0x02 /* J = M >> 2 groups of four lines */ | |||
| slli.d T0, TL, 0x01 /* T0 = 2 * TL */ | |||
| slli.d T1, M, 0x05 /* T1 = M * 32: byte step added to P1 after each 4-double chunk */ | |||
| beq ZERO, J, .L_M3 | |||
| .L_J1: /* J-- */ | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S3, S1, T0 | |||
| add.d S4, S2, T0 | |||
| add.d S0, S3, T0 /* S0 = first line of the next group */ | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x80 /* four lines * 4 doubles = 0x80 bytes per group */ | |||
| srai.d I, N, 0x02 /* I = N >> 2 chunks of 4 doubles */ | |||
| addi.d J, J, -1 | |||
| beq ZERO, I, .L_N3 | |||
| .L_I1: /* I-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, P1, 0x00 | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| vst U4, P1, 0x40 | |||
| vst U5, P1, 0x50 | |||
| vst U6, P1, 0x60 | |||
| vst U7, P1, 0x70 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| add.d P1, P1, T1 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_N3: | |||
| andi I, N, 0x02 /* two leftover doubles per line -> P2 */ | |||
| beq ZERO, I, .L_N1 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vst U0, P2, 0x00 | |||
| vst U1, P2, 0x10 | |||
| vst U2, P2, 0x20 | |||
| vst U3, P2, 0x30 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d P2, P2, 0x40 | |||
| .L_N1: | |||
| andi I, N, 0x01 /* one leftover double per line -> P3 */ | |||
| beq ZERO, I, .L_N0 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| fst.d F1, P3, 0x08 | |||
| fst.d F2, P3, 0x10 | |||
| fst.d F3, P3, 0x18 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d P3, P3, 0x20 | |||
| .L_N0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_M3: | |||
| /* one group of two lines, present when bit 1 of M is set */ | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S0, S0, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x40 /* two lines * 4 doubles = 0x40 bytes */ | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_2N3 | |||
| .L_2I1: /* I-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, P1, 0x00 | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_2N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_2N1 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, P2, 0x00 | |||
| vst U1, P2, 0x10 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d P2, P2, 0x20 | |||
| .L_2N1: | |||
| /* test bit 0 of N with andi, as the 4-line and 1-line tails do; the former | |||
| addi.d produced N + 1, so this tail also ran (and wrote past P3) for even N */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| fst.d F1, P3, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d P3, P3, 0x10 | |||
| .L_M1: | |||
| /* last single line, present when M is odd */ | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| move P1, P0 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_1N3 | |||
| .L_1I1: | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, P1, 0x00 | |||
| vst U1, P1, 0x10 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_1I1 | |||
| .L_1N3: | |||
| andi I, N, 0x02 | |||
| beq I, ZERO, .L_1N1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fst.d F0, P2, 0x00 | |||
| fst.d F1, P2, 0x08 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d P2, P2, 0x10 | |||
| .L_1N1: | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_M0 | |||
| fld.d F0, S1, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| .L_M0: | |||
| pop_if_used 18, 8 | |||
| jirl $r0, $r1, 0x00 /* return to the address in $r1 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,597 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Function parameters: symbolic names for the five integer argument registers */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 /* inner loop counter */ | |||
| #define J $r10 /* outer loop counter */ | |||
| #define S0 $r11 /* start of the next unprocessed source line, initialised from SRC */ | |||
| #define S1 $r12 /* S1-S8: source line pointers of the group being packed */ | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define P0 $r20 /* start of the next 8-wide output panel, initialised from DST */ | |||
| #define P1 $r23 /* write cursor inside the 8-wide region */ | |||
| #define P2 $r24 /* write cursor of the 4-wide region, starts at DST + M * (N & ~7) * 8 */ | |||
| #define P3 $r25 /* write cursor of the 2-wide region, starts at DST + M * (N & ~3) * 8 */ | |||
| #define P4 $r26 /* write cursor of the 1-wide region, starts at DST + M * (N & ~1) * 8 */ | |||
| #define P5 $r27 | |||
| #define T0 $r28 /* scratch, finally 2 * TL */ | |||
| #define T1 $r29 /* scratch, finally M << 6 */ | |||
| #define TL $r7 /* same register as LDA: overwritten with LDA << 3 */ | |||
| #define ZERO $r0 /* used as the constant 0 in branch comparisons */ | |||
| #define F0 $f0 /* F0-F7: scalar temporaries of the single-double tails */ | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors ($vr registers, used with vld/vst) */ | |||
| #define U0 $vr0 /* U0-U7: vectors copied unchanged from source to destination */ | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| PROLOGUE | |||
| push_if_used 24, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| srai.d T0, N, 0x03 | |||
| srai.d T1, N, 0x02 | |||
| slli.d T0, T0, 0x03 | |||
| slli.d T1, T1, 0x02 | |||
| mul.d P2, M, T0 | |||
| mul.d P3, M, T1 | |||
| slli.d P2, P2, 0x03 | |||
| slli.d P3, P3, 0x03 | |||
| add.d P2, DST, P2 | |||
| add.d P3, DST, P3 | |||
| srai.d T0, N, 0x01 | |||
| slli.d T0, T0, 0x01 | |||
| mul.d P4, M, T0 | |||
| slli.d P4, P4, 0x03 | |||
| add.d P4, DST, P4 | |||
| slli.d TL, LDA, 0x03 | |||
| srai.d J, M, 0x03 | |||
| slli.d T0, TL, 0x01 | |||
| slli.d T1, M, 0x06 | |||
| beq ZERO, J, .L_M7 | |||
| .L_J1: /* J-- */ | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S3, S1, T0 | |||
| add.d S4, S2, T0 | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d S0, S7, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x200 | |||
| srai.d I, N, 0x03 | |||
| addi.d J, J, -1 | |||
| beq ZERO, I, .L_N7 | |||
| .L_I1: /* I-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, P1, 0x00 | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| vst U4, P1, 0x40 | |||
| vst U5, P1, 0x50 | |||
| vst U6, P1, 0x60 | |||
| vst U7, P1, 0x70 | |||
| vld U0, S3, 0x00 | |||
| vld U1, S3, 0x10 | |||
| vld U2, S3, 0x20 | |||
| vld U3, S3, 0x30 | |||
| vld U4, S4, 0x00 | |||
| vld U5, S4, 0x10 | |||
| vld U6, S4, 0x20 | |||
| vld U7, S4, 0x30 | |||
| vst U0, P1, 0x80 | |||
| vst U1, P1, 0x90 | |||
| vst U2, P1, 0xa0 | |||
| vst U3, P1, 0xb0 | |||
| vst U4, P1, 0xc0 | |||
| vst U5, P1, 0xd0 | |||
| vst U6, P1, 0xe0 | |||
| vst U7, P1, 0xf0 | |||
| vld U0, S5, 0x00 | |||
| vld U1, S5, 0x10 | |||
| vld U2, S5, 0x20 | |||
| vld U3, S5, 0x30 | |||
| vld U4, S6, 0x00 | |||
| vld U5, S6, 0x10 | |||
| vld U6, S6, 0x20 | |||
| vld U7, S6, 0x30 | |||
| vst U0, P1, 0x100 | |||
| vst U1, P1, 0x110 | |||
| vst U2, P1, 0x120 | |||
| vst U3, P1, 0x130 | |||
| vst U4, P1, 0x140 | |||
| vst U5, P1, 0x150 | |||
| vst U6, P1, 0x160 | |||
| vst U7, P1, 0x170 | |||
| vld U0, S7, 0x00 | |||
| vld U1, S7, 0x10 | |||
| vld U2, S7, 0x20 | |||
| vld U3, S7, 0x30 | |||
| vld U4, S8, 0x00 | |||
| vld U5, S8, 0x10 | |||
| vld U6, S8, 0x20 | |||
| vld U7, S8, 0x30 | |||
| vst U0, P1, 0x180 | |||
| vst U1, P1, 0x190 | |||
| vst U2, P1, 0x1a0 | |||
| vst U3, P1, 0x1b0 | |||
| vst U4, P1, 0x1c0 | |||
| vst U5, P1, 0x1d0 | |||
| vst U6, P1, 0x1e0 | |||
| vst U7, P1, 0x1f0 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, P2, 0x00 | |||
| vst U1, P2, 0x10 | |||
| vst U2, P2, 0x20 | |||
| vst U3, P2, 0x30 | |||
| vst U4, P2, 0x40 | |||
| vst U5, P2, 0x50 | |||
| vst U6, P2, 0x60 | |||
| vst U7, P2, 0x70 | |||
| vld U0, S5, 0x00 | |||
| vld U1, S5, 0x10 | |||
| vld U2, S6, 0x00 | |||
| vld U3, S6, 0x10 | |||
| vld U4, S7, 0x00 | |||
| vld U5, S7, 0x10 | |||
| vld U6, S8, 0x00 | |||
| vld U7, S8, 0x10 | |||
| vst U0, P2, 0x80 | |||
| vst U1, P2, 0x90 | |||
| vst U2, P2, 0xa0 | |||
| vst U3, P2, 0xb0 | |||
| vst U4, P2, 0xc0 | |||
| vst U5, P2, 0xd0 | |||
| vst U6, P2, 0xe0 | |||
| vst U7, P2, 0xf0 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d S5, S5, 0x20 | |||
| addi.d S6, S6, 0x20 | |||
| addi.d S7, S7, 0x20 | |||
| addi.d S8, S8, 0x20 | |||
| addi.d P2, P2, 0x100 | |||
| .L_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vld U4, S5, 0x00 | |||
| vld U5, S6, 0x00 | |||
| vld U6, S7, 0x00 | |||
| vld U7, S8, 0x00 | |||
| vst U0, P3, 0x00 | |||
| vst U1, P3, 0x10 | |||
| vst U2, P3, 0x20 | |||
| vst U3, P3, 0x30 | |||
| vst U4, P3, 0x40 | |||
| vst U5, P3, 0x50 | |||
| vst U6, P3, 0x60 | |||
| vst U7, P3, 0x70 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d S5, S5, 0x10 | |||
| addi.d S6, S6, 0x10 | |||
| addi.d S7, S7, 0x10 | |||
| addi.d S8, S8, 0x10 | |||
| addi.d P3, P3, 0x80 | |||
| .L_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, P4, 0x00 | |||
| fst.d F1, P4, 0x08 | |||
| fst.d F2, P4, 0x10 | |||
| fst.d F3, P4, 0x18 | |||
| fst.d F4, P4, 0x20 | |||
| fst.d F5, P4, 0x28 | |||
| fst.d F6, P4, 0x30 | |||
| fst.d F7, P4, 0x38 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d S5, S5, 0x08 | |||
| addi.d S6, S6, 0x08 | |||
| addi.d S7, S7, 0x08 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d P4, P4, 0x40 | |||
| .L_N0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_M7: /* M & 4 remainder: copy a group of four source lines (S1..S4) when bit 2 of M is set */ | |||
| andi J, M, 0x04 /* J = M & 4 */ | |||
| beq ZERO, J, .L_M3 /* bit clear: continue with the M & 2 case */ | |||
| move S1, S0 /* S1 = current source base */ | |||
| add.d S2, S0, TL /* S2 = S0 + TL (TL is set outside this excerpt; presumably the byte stride of one source line) */ | |||
| add.d S3, S1, T0 /* S3 = S1 + T0 (T0 is presumably 2 * TL, i.e. two lines further; TODO confirm) */ | |||
| add.d S4, S2, T0 /* S4 = S2 + T0 */ | |||
| add.d S0, S3, T0 /* advance the source base for the sections that follow */ | |||
| move P1, P0 /* P1 = destination cursor for the 8-double chunks */ | |||
| addi.d P0, P0, 0x100 /* advance P0 by 0x100 bytes = 4 lines * 8 doubles * 8 bytes */ | |||
| srai.d I, N, 0x03 /* I = N >> 3: number of full 8-double chunks */ | |||
| beq ZERO, I, .L_4N7 /* no full chunk: go straight to the remainders */ | |||
| .L_4I1: /* loop, I counts down: each pass copies 8 doubles (0x40 bytes) from each of S1..S4 */ | |||
| vld U0, S1, 0x00 /* four 16-byte loads from S1 */ | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 /* four 16-byte loads from S2 */ | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, P1, 0x00 /* S1 data -> P1+0x00..0x3f, S2 data -> P1+0x40..0x7f */ | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| vst U4, P1, 0x40 | |||
| vst U5, P1, 0x50 | |||
| vst U6, P1, 0x60 | |||
| vst U7, P1, 0x70 | |||
| vld U0, S3, 0x00 /* U0..U7 are reused for the S3 and S4 data */ | |||
| vld U1, S3, 0x10 | |||
| vld U2, S3, 0x20 | |||
| vld U3, S3, 0x30 | |||
| vld U4, S4, 0x00 | |||
| vld U5, S4, 0x10 | |||
| vld U6, S4, 0x20 | |||
| vld U7, S4, 0x30 | |||
| vst U0, P1, 0x80 /* S3 data -> P1+0x80..0xbf, S4 data -> P1+0xc0..0xff */ | |||
| vst U1, P1, 0x90 | |||
| vst U2, P1, 0xa0 | |||
| vst U3, P1, 0xb0 | |||
| vst U4, P1, 0xc0 | |||
| vst U5, P1, 0xd0 | |||
| vst U6, P1, 0xe0 | |||
| vst U7, P1, 0xf0 | |||
| addi.d S1, S1, 0x40 /* step each source pointer by the 0x40 bytes consumed */ | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d I, I, -1 /* one chunk done */ | |||
| add.d P1, P1, T1 /* move P1 by T1 (set outside this excerpt; presumably the distance to the next 8-wide destination panel) */ | |||
| blt ZERO, I, .L_4I1 /* repeat while ZERO < I */ | |||
| .L_4N7: /* N & 4 remainder: 4 doubles (two 16-byte vectors) per source pointer, written through P2 */ | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_4N3 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, P2, 0x00 /* stored in S1, S2, S3, S4 order */ | |||
| vst U1, P2, 0x10 | |||
| vst U2, P2, 0x20 | |||
| vst U3, P2, 0x30 | |||
| vst U4, P2, 0x40 | |||
| vst U5, P2, 0x50 | |||
| vst U6, P2, 0x60 | |||
| vst U7, P2, 0x70 | |||
| addi.d S1, S1, 0x20 /* 0x20 bytes consumed per source pointer */ | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d P2, P2, 0x80 /* 4 pointers * 0x20 bytes written through P2 */ | |||
| .L_4N3: /* N & 2 remainder: 2 doubles (one 16-byte vector) per source pointer, written through P3 */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_4N1 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vst U0, P3, 0x00 | |||
| vst U1, P3, 0x10 | |||
| vst U2, P3, 0x20 | |||
| vst U3, P3, 0x30 | |||
| addi.d S1, S1, 0x10 /* 0x10 bytes consumed per source pointer */ | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d P3, P3, 0x40 /* 4 pointers * 0x10 bytes written through P3 */ | |||
| .L_4N1: /* N & 1 remainder: one double per source pointer, written through P4 */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 /* no odd element: this group is finished */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, P4, 0x00 | |||
| fst.d F1, P4, 0x08 | |||
| fst.d F2, P4, 0x10 | |||
| fst.d F3, P4, 0x18 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d P4, P4, 0x20 /* 4 doubles * 8 bytes written through P4 */ | |||
| .L_M3: /* M & 2 remainder: copy a group of two source lines (S1, S2) when bit 1 of M is set */ | |||
| andi J, M, 0x02 /* J = M & 2 */ | |||
| beq ZERO, J, .L_M1 /* bit clear: continue with the M & 1 case */ | |||
| move S1, S0 /* S1 = current source base */ | |||
| add.d S2, S0, TL /* S2 = S0 + TL (TL is set outside this excerpt; presumably the byte stride of one source line) */ | |||
| add.d S0, S0, T0 /* advance the source base by T0 (presumably 2 * TL; TODO confirm) */ | |||
| move P1, P0 /* P1 = destination cursor for the 8-double chunks */ | |||
| addi.d P0, P0, 0x80 /* advance P0 by 0x80 bytes = 2 lines * 8 doubles * 8 bytes */ | |||
| srai.d I, N, 0x03 /* I = N >> 3: number of full 8-double chunks */ | |||
| beq ZERO, I, .L_2N7 /* no full chunk: go straight to the remainders */ | |||
| .L_2I1: /* loop, I counts down: each pass copies 8 doubles (0x40 bytes) from each of S1 and S2 */ | |||
| vld U0, S1, 0x00 /* four 16-byte loads from S1 */ | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 /* four 16-byte loads from S2 */ | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, P1, 0x00 /* S1 data -> P1+0x00..0x3f, S2 data -> P1+0x40..0x7f */ | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| vst U4, P1, 0x40 | |||
| vst U5, P1, 0x50 | |||
| vst U6, P1, 0x60 | |||
| vst U7, P1, 0x70 | |||
| addi.d S1, S1, 0x40 /* step both source pointers by the 0x40 bytes consumed */ | |||
| addi.d S2, S2, 0x40 | |||
| addi.d I, I, -1 /* one chunk done */ | |||
| add.d P1, P1, T1 /* move P1 by T1 (set outside this excerpt; presumably the distance to the next 8-wide destination panel) */ | |||
| blt ZERO, I, .L_2I1 /* repeat while ZERO < I */ | |||
| .L_2N7: /* N & 4 remainder: 4 doubles (two 16-byte vectors) per source pointer, written through P2 */ | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_2N3 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, P2, 0x00 | |||
| vst U1, P2, 0x10 | |||
| vst U2, P2, 0x20 | |||
| vst U3, P2, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d P2, P2, 0x40 /* 2 pointers * 0x20 bytes written through P2 */ | |||
| .L_2N3: /* N & 2 remainder: 2 doubles (one 16-byte vector) per source pointer, written through P3 */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_2N1 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, P3, 0x00 | |||
| vst U1, P3, 0x10 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d P3, P3, 0x20 /* 2 pointers * 0x10 bytes written through P3 */ | |||
| .L_2N1: /* N & 1 remainder: one double per source pointer, written through P4 */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 /* no odd element: this group is finished */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, P4, 0x00 | |||
| fst.d F1, P4, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d P4, P4, 0x10 /* 2 doubles * 8 bytes written through P4 */ | |||
| .L_M1: /* M & 1 remainder: copy the single last source line (S1) when bit 0 of M is set */ | |||
| andi J, M, 0x01 /* J = M & 1 */ | |||
| beq ZERO, J, .L_M0 /* bit clear: nothing left, go to the return sequence */ | |||
| move S1, S0 /* S1 = current source base; only S1 is read in this section, so no second line pointer is set up */ | |||
| move P1, P0 /* P1 = destination cursor for the 8-double chunks */ | |||
| addi.d P0, P0, 0x40 /* advance P0 by 0x40 bytes = 1 line * 8 doubles * 8 bytes */ | |||
| srai.d I, N, 0x03 /* I = N >> 3: number of full 8-double chunks */ | |||
| beq ZERO, I, .L_1N7 /* no full chunk: go straight to the remainders */ | |||
| .L_1I1: /* loop, I counts down: each pass copies 8 doubles (0x40 bytes) from S1 */ | |||
| vld U0, S1, 0x00 /* four 16-byte loads from S1 */ | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vst U0, P1, 0x00 /* stored contiguously at P1+0x00..0x3f */ | |||
| vst U1, P1, 0x10 | |||
| vst U2, P1, 0x20 | |||
| vst U3, P1, 0x30 | |||
| addi.d S1, S1, 0x40 /* step the source pointer by the 0x40 bytes consumed */ | |||
| addi.d I, I, -1 /* one chunk done */ | |||
| add.d P1, P1, T1 /* move P1 by T1 (set outside this excerpt; presumably the distance to the next 8-wide destination panel) */ | |||
| blt ZERO, I, .L_1I1 /* repeat while ZERO < I */ | |||
| .L_1N7: /* N & 4 remainder: 4 doubles (two 16-byte vectors), written through P2 */ | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_1N3 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, P2, 0x00 | |||
| vst U1, P2, 0x10 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d P2, P2, 0x20 /* 0x20 bytes written through P2 */ | |||
| .L_1N3: /* N & 2 remainder: 2 doubles (one 16-byte vector), written through P3 */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_1N1 | |||
| vld U0, S1, 0x00 | |||
| vst U0, P3, 0x00 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d P3, P3, 0x10 /* 0x10 bytes written through P3 */ | |||
| .L_1N1: /* N & 1 remainder: one double, written through P4 */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 /* no odd element: done */ | |||
| fld.d F0, S1, 0x00 | |||
| fst.d F0, P4, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d P4, P4, 0x08 /* 8 bytes written through P4 */ | |||
| .L_M0: /* common exit: every M remainder path ends up here */ | |||
| pop_if_used 24, 8 /* macro defined outside this excerpt; presumably restores the registers saved by the matching prologue macro (TODO confirm the meaning of 24 and 8) */ | |||
| jirl $r0, $r1, 0x00 /* jump to the address in $r1 and discard the link value ($r0): return to the caller */ | |||
| EPILOGUE /* macro defined outside this excerpt; closes the function definition */ | |||