| @@ -75,14 +75,29 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| @@ -0,0 +1,340 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 | |||
| #define J x10 | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v10.d[0], v0.d[1] | |||
| ins v12.d[0], v1.d[0] | |||
| ins v14.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v10.d[1], v2.d[1] | |||
| ins v12.d[1], v3.d[0] | |||
| ins v14.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v9.d[0], v4.d[0] | |||
| ins v11.d[0], v4.d[1] | |||
| ins v13.d[0], v5.d[0] | |||
| ins v15.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v9.d[1], v6.d[0] | |||
| ins v11.d[1], v6.d[1] | |||
| ins v13.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v9.d[0], v0.d[1] | |||
| ins v10.d[0], v1.d[0] | |||
| ins v11.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v9.d[1], v2.d[1] | |||
| ins v10.d[1], v3.d[0] | |||
| ins v11.d[1], v3.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B00] | |||
| add B00, B00, #16 | |||
| .endm | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| stp q0, q1, [B00], #32 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B00], #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_20 | |||
| dgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_60 | |||
| dgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_20 | |||
| dgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_60 | |||
| dgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_20 | |||
| dgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_60 | |||
| dgemm_ncopy_L1_M4_END: | |||
| dgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,544 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define A05 x9 | |||
| #define A06 x10 | |||
| #define A07 x11 | |||
| #define A08 x12 | |||
| #define I x13 | |||
| #define J x14 | |||
| #define TEMP1 x15 | |||
| #define TEMP2 x16 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| COPY4x8 | |||
| COPY4x8 | |||
| .endm | |||
| .macro COPY4x8 | |||
| ldp q0, q1, [A01], #32 | |||
| ins v16.d[0], v0.d[0] | |||
| ins v20.d[0], v0.d[1] | |||
| ins v24.d[0], v1.d[0] | |||
| ins v28.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v16.d[1], v2.d[0] | |||
| ins v20.d[1], v2.d[1] | |||
| ins v24.d[1], v3.d[0] | |||
| ins v28.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v17.d[0], v4.d[0] | |||
| ins v21.d[0], v4.d[1] | |||
| ins v25.d[0], v5.d[0] | |||
| ins v29.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v17.d[1], v6.d[0] | |||
| ins v21.d[1], v6.d[1] | |||
| ins v25.d[1], v7.d[0] | |||
| ins v29.d[1], v7.d[1] | |||
| ldp q8, q9, [A05], #32 | |||
| ins v18.d[0], v8.d[0] | |||
| ins v22.d[0], v8.d[1] | |||
| ins v26.d[0], v9.d[0] | |||
| ins v30.d[0], v9.d[1] | |||
| ldp q10, q11, [A06], #32 | |||
| ins v18.d[1], v10.d[0] | |||
| ins v22.d[1], v10.d[1] | |||
| ins v26.d[1], v11.d[0] | |||
| ins v30.d[1], v11.d[1] | |||
| ldp q12, q13, [A07], #32 | |||
| ins v19.d[0], v12.d[0] | |||
| ins v23.d[0], v12.d[1] | |||
| ins v27.d[0], v13.d[0] | |||
| ins v31.d[0], v13.d[1] | |||
| ldp q14, q15, [A08], #32 | |||
| ins v19.d[1], v14.d[0] | |||
| ins v23.d[1], v14.d[1] | |||
| ins v27.d[1], v15.d[0] | |||
| ins v31.d[1], v15.d[1] | |||
| st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v10.d[0], v0.d[1] | |||
| ins v12.d[0], v1.d[0] | |||
| ins v14.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v10.d[1], v2.d[1] | |||
| ins v12.d[1], v3.d[0] | |||
| ins v14.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v9.d[0], v4.d[0] | |||
| ins v11.d[0], v4.d[1] | |||
| ins v13.d[0], v5.d[0] | |||
| ins v15.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v9.d[1], v6.d[0] | |||
| ins v11.d[1], v6.d[1] | |||
| ins v13.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| ldp q16, q17, [A01], #32 | |||
| ins v24.d[0], v16.d[0] | |||
| ins v26.d[0], v16.d[1] | |||
| ins v28.d[0], v17.d[0] | |||
| ins v30.d[0], v17.d[1] | |||
| ldp q18, q19, [A02], #32 | |||
| ins v24.d[1], v18.d[0] | |||
| ins v26.d[1], v18.d[1] | |||
| ins v28.d[1], v19.d[0] | |||
| ins v30.d[1], v19.d[1] | |||
| ldp q20, q21, [A03], #32 | |||
| ins v25.d[0], v20.d[0] | |||
| ins v27.d[0], v20.d[1] | |||
| ins v29.d[0], v21.d[0] | |||
| ins v31.d[0], v21.d[1] | |||
| ldp q22, q23, [A04], #32 | |||
| ins v25.d[1], v22.d[0] | |||
| ins v27.d[1], v22.d[1] | |||
| ins v29.d[1], v23.d[0] | |||
| ins v31.d[1], v23.d[1] | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v9.d[0], v0.d[1] | |||
| ins v10.d[0], v1.d[0] | |||
| ins v11.d[0], v1.d[1] | |||
| ins v12.d[0], v2.d[0] | |||
| ins v13.d[0], v2.d[1] | |||
| ins v14.d[0], v3.d[0] | |||
| ins v15.d[0], v3.d[1] | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| ins v8.d[1], v4.d[0] | |||
| ins v9.d[1], v4.d[1] | |||
| ins v10.d[1], v5.d[0] | |||
| ins v11.d[1], v5.d[1] | |||
| ins v12.d[1], v6.d[0] | |||
| ins v13.d[1], v6.d[1] | |||
| ins v14.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B00] | |||
| add B00, B00, #16 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| stp q0, q1, [B00], #32 | |||
| stp q2, q3, [B00], #32 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B00], #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L8_BEGIN: | |||
| asr J, N, #3 // J = N / 8 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L4_BEGIN | |||
| dgemm_ncopy_L8_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A00, A08, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_40 | |||
| dgemm_ncopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_20 | |||
| dgemm_ncopy_L8_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_END | |||
| dgemm_ncopy_L8_M8_60: | |||
| COPY1x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_60 | |||
| dgemm_ncopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L4_BEGIN: | |||
| tst N, #7 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #4 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| dgemm_ncopy_L4_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_40 | |||
| dgemm_ncopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_20 | |||
| dgemm_ncopy_L4_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_END | |||
| dgemm_ncopy_L4_M8_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_60 | |||
| dgemm_ncopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_40 | |||
| dgemm_ncopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_20 | |||
| dgemm_ncopy_L2_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_END | |||
| dgemm_ncopy_L2_M8_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_60 | |||
| dgemm_ncopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M8_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_40 | |||
| dgemm_ncopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_20 | |||
| dgemm_ncopy_L1_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_END | |||
| dgemm_ncopy_L1_M8_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_60 | |||
| dgemm_ncopy_L1_M8_END: | |||
| dgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,402 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| #define M4 x5 | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define B01 x10 | |||
| #define B02 x11 | |||
| #define B03 x12 | |||
| #define B04 x13 | |||
| #define I x14 | |||
| #define J x15 | |||
| #define TEMP1 x16 | |||
| #define TEMP2 x17 | |||
| #define A_PREFETCH 2560 | |||
| #define B_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B03] | |||
| add B03, B03, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| stp q0, q1, [B02] | |||
| add B02, B02, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| stp d0, d1, [B03] | |||
| add B03, B03, #16 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| stp q0, q1, [B01] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| str q0, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| str d0, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #3 // x12 = M * SIZE | |||
| and B02 , N , #-4 | |||
| and B03 , N , #-2 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| lsl M4, M, #5 // M4 = M * 4 * SIZE | |||
| dgemm_tcopy_L4_BEGIN: | |||
| asr J, M, #2 // J = M / 4 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B01, B | |||
| add B, B01, #128 // B = B + 16 * SIZE | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M4_20 | |||
| dgemm_tcopy_L4_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M4_60 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M4_60: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M4_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B01, B | |||
| add B, B01, #64 // B = B + 8 * SIZE | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M4_20 | |||
| dgemm_tcopy_L2_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M4_60 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M4_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M4_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M4_20 | |||
| dgemm_tcopy_L1_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M4_60 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M4_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M4_END: | |||
| dgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,682 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| #define M8 x5 | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define A05 x10 | |||
| #define A06 x11 | |||
| #define A07 x12 | |||
| #define A08 x13 | |||
| #define B01 x14 | |||
| #define B02 x15 | |||
| #define B03 x16 | |||
| #define B04 x17 | |||
| #define I x18 | |||
| #define J x19 | |||
| #define TEMP1 x20 | |||
| #define TEMP2 x21 | |||
| #define A_PREFETCH 2560 | |||
| #define B_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q8, q9, [A03], #32 | |||
| ldp q10, q11, [A03], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q12, q13, [A04], #32 | |||
| ldp q14, q15, [A04], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q16, q17, [A05], #32 | |||
| ldp q18, q19, [A05], #32 | |||
| st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q20, q21, [A06], #32 | |||
| ldp q22, q23, [A06], #32 | |||
| st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q24, q25, [A07], #32 | |||
| ldp q26, q27, [A07], #32 | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q28, q29, [A08], #32 | |||
| ldp q30, q31, [A08], #32 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q8, q9, [A05], #32 | |||
| ldp q10, q11, [A06], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q12, q13, [A07], #32 | |||
| ldp q14, q15, [A08], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||
| add B03, B03, #64 | |||
| ldr q4, [A05], #16 | |||
| ldr q5, [A06], #16 | |||
| ldr q6, [A07], #16 | |||
| ldr q7, [A08], #16 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B03] | |||
| add B03, B03, #64 | |||
| .endm | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||
| add B04, B04, #32 | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B04] | |||
| add B04, B04, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q8, q9, [A03], #32 | |||
| ldp q10, q11, [A03], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q12, q13, [A04], #32 | |||
| ldp q14, q15, [A04], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||
| add B03, B03, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||
| add B04, B04, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| stp q0, q1, [B03] | |||
| add B03, B03, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B04] | |||
| add B04, B04, #16 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| stp q0, q1, [B01] | |||
| add TEMP1, B01, #32 | |||
| stp q2, q3, [TEMP1] | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| stp q0, q1, [B02] | |||
| add B02, B02, #32 | |||
| .endm | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B03] | |||
| add B03, B03, #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B04] | |||
| add B04, B04, #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #3 // TEMP1 = M * SIZE | |||
| and B02 , N , #-8 | |||
| and B03 , N , #-4 | |||
| and B04 , N , #-2 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| mul B04, B04, TEMP1 | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| add B04 , B04, B | |||
| lsl M8, M, #6 // M8 = M * 8 * SIZE | |||
| dgemm_tcopy_L8_BEGIN: | |||
| asr J, M, #3 // J = M / 4 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L4_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A, A08, LDA | |||
| mov B01, B | |||
| add B, B01, #512 // B = B + 64 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L8_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L8_M8_20 | |||
| dgemm_tcopy_L8_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L8_M8_60 | |||
| COPY4x8 | |||
| dgemm_tcopy_L8_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L8_M8_80 | |||
| COPY2x8 | |||
| dgemm_tcopy_L8_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L8_M8_END | |||
| COPY1x8 | |||
| dgemm_tcopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L4_BEGIN: | |||
| tst M, #7 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #4 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| dgemm_tcopy_L4_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B01, B | |||
| add B, B01, #256 // B = B + 32 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M8_20 | |||
| dgemm_tcopy_L4_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L4_M8_60 | |||
| COPY4x4 | |||
| dgemm_tcopy_L4_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M8_80 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M8_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B01, B | |||
| add B, B01, #128 // B = B + 16 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M8_20 | |||
| dgemm_tcopy_L2_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L2_M8_60 | |||
| COPY4x2 | |||
| dgemm_tcopy_L2_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M8_80 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M8_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M8_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M8_20 | |||
| dgemm_tcopy_L1_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L1_M8_60 | |||
| COPY4x1 | |||
| dgemm_tcopy_L1_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M8_80 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M8_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M8_END: | |||
| dgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||