Co-Authored-By: Ralph Campbell <ralph.campbell@broadcom.com>
| @@ -50,4 +50,13 @@ DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| @@ -0,0 +1,320 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 /* Y vector length */ | |||
| #define N x1 /* X vector length */ | |||
| #define A x3 /* A vector address */ | |||
| #define LDA x4 /* A stride */ | |||
| #define X x5 /* X vector address */ | |||
| #define INC_X x6 /* X stride */ | |||
| #define Y x7 /* Y vector address */ | |||
| #define INC_Y x2 /* Y stride */ | |||
| #define A_PTR x9 /* loop A vector address */ | |||
| #define Y_IPTR x10 /* loop Y vector address */ | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define Y_OPTR x13 /* loop Y vector address */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define ALPHA s0 | |||
| #define TEMP s1 | |||
| #define TEMPV {v1.s}[0] | |||
| #define TMP1 s2 | |||
| #define TMPV1 {v2.s}[0] | |||
| #define TMP2 s3 | |||
| #define TMPV2 {v3.s}[0] | |||
| #define SZ 4 | |||
| #define SHZ 2 | |||
| #else | |||
| #define ALPHA d0 | |||
| #define TEMP d1 | |||
| #define TEMPV {v1.d}[0] | |||
| #define TMP1 d2 | |||
| #define TMPV1 {v2.d}[0] | |||
| #define TMP2 d3 | |||
| #define TMPV2 {v3.d}[0] | |||
| #define SZ 8 | |||
| #define SHZ 3 | |||
| #endif | |||
| /******************************************************************************/ | |||
/*
 * SAVE_REGS / RESTORE_REGS - prologue/epilogue spill pair.
 *
 * Under AAPCS64 this kernel only clobbers callee-saved state in
 * v8-v15 (low 64 bits); the GPRs it uses (x0-x14) are all volatile.
 * The previous version also spilled the caller-saved d16/d17, the
 * never-touched x19-x28, and loaded/stored the platform-reserved x18
 * (which AAPCS64 forbids general code from using).  Only d8-d15 are
 * saved now.  The two macros must always stay in sync: same frame
 * size, same slot layout.
 */
.macro SAVE_REGS
	sub	sp, sp, #(4 * 16)
	stp	d8,  d9,  [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
.endm

.macro RESTORE_REGS
	ldp	d8,  d9,  [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	add	sp, sp, #(4 * 16)
.endm
/*
 * KERNEL_F16 - unit-stride AXPY step over 16 elements:
 *   y[0:16] += temp * a[0:16]
 * where temp = alpha * x[j] has already been broadcast into every
 * lane of v1.  Advances A_PTR, Y_IPTR and Y_OPTR by 16 elements.
 */
.macro KERNEL_F16
#if !defined(DOUBLE)
	ld1	{v2.4s, v3.4s}, [A_PTR], #32	// a[0..7]
	ld1	{v4.4s, v5.4s}, [Y_IPTR], #32	// y[0..7]
	fmla	v4.4s, v1.4s, v2.4s		// y += temp * a
	fmla	v5.4s, v1.4s, v3.4s
	st1	{v4.4s, v5.4s}, [Y_OPTR], #32
	ld1	{v6.4s, v7.4s}, [A_PTR], #32	// a[8..15]
	ld1	{v8.4s, v9.4s}, [Y_IPTR], #32	// y[8..15]
	fmla	v8.4s, v1.4s, v6.4s
	fmla	v9.4s, v1.4s, v7.4s
	st1	{v8.4s, v9.4s}, [Y_OPTR], #32
#else	// DOUBLE
	ld1	{v2.2d, v3.2d}, [A_PTR], #32	// a[0..3]
	ld1	{v4.2d, v5.2d}, [Y_IPTR], #32	// y[0..3]
	fmla	v4.2d, v1.2d, v2.2d
	fmla	v5.2d, v1.2d, v3.2d
	st1	{v4.2d, v5.2d}, [Y_OPTR], #32
	ld1	{v6.2d, v7.2d}, [A_PTR], #32	// a[4..7]
	ld1	{v8.2d, v9.2d}, [Y_IPTR], #32	// y[4..7]
	fmla	v8.2d, v1.2d, v6.2d
	fmla	v9.2d, v1.2d, v7.2d
	st1	{v8.2d, v9.2d}, [Y_OPTR], #32
	ld1	{v10.2d, v11.2d}, [A_PTR], #32	// a[8..11]
	ld1	{v12.2d, v13.2d}, [Y_IPTR], #32	// y[8..11]
	fmla	v12.2d, v1.2d, v10.2d
	fmla	v13.2d, v1.2d, v11.2d
	st1	{v12.2d, v13.2d}, [Y_OPTR], #32
	ld1	{v14.2d, v15.2d}, [A_PTR], #32	// a[12..15]
	ld1	{v16.2d, v17.2d}, [Y_IPTR], #32	// y[12..15]
	fmla	v16.2d, v1.2d, v14.2d
	fmla	v17.2d, v1.2d, v15.2d
	st1	{v16.2d, v17.2d}, [Y_OPTR], #32
#endif
.endm
/*
 * KERNEL_F4 - unit-stride AXPY step over 4 elements:
 *   y[0:4] += temp * a[0:4], temp broadcast in v1.
 */
.macro KERNEL_F4
#if !defined(DOUBLE)
	ld1	{v2.4s}, [A_PTR], #16
	ld1	{v3.4s}, [Y_IPTR], #16
	fmla	v3.4s, v1.4s, v2.4s		// y += temp * a
	st1	{v3.4s}, [Y_OPTR], #16
#else	// DOUBLE: two 2-lane rounds
	ld1	{v2.2d}, [A_PTR], #16
	ld1	{v3.2d}, [Y_IPTR], #16
	fmla	v3.2d, v1.2d, v2.2d
	st1	{v3.2d}, [Y_OPTR], #16
	ld1	{v4.2d}, [A_PTR], #16
	ld1	{v5.2d}, [Y_IPTR], #16
	fmla	v5.2d, v1.2d, v4.2d
	st1	{v5.2d}, [Y_OPTR], #16
#endif
.endm
/*
 * KERNEL_F1 - scalar tail step for unit-stride Y:
 *   *Y_IPTR += TEMP * (*A_PTR);  both pointers advance one element.
 * The load of Y does not post-increment; the store does.
 */
.macro KERNEL_F1
	ld1	TMPV1, [A_PTR], #SZ		// TMP1 = *a++
	ld1	TMPV2, [Y_IPTR]			// TMP2 = *y
	fmadd	TMP2, TEMP, TMP1, TMP2		// TMP2 += temp * TMP1
	st1	TMPV2, [Y_IPTR], #SZ		// *y++ = TMP2
.endm
/* INIT_S - convert the Y stride from elements to bytes for the strided path. */
.macro INIT_S
	lsl	INC_Y, INC_Y, #SHZ
.endm
/*
 * KERNEL_S1 - scalar step for strided Y:
 *   *Y_IPTR += TEMP * (*A_PTR);  Y advances by INC_Y bytes.
 */
.macro KERNEL_S1
	ld1	TMPV1, [A_PTR], #SZ		// TMP1 = *a++
	ld1	TMPV2, [Y_IPTR]			// TMP2 = *y
	fmadd	TMP2, TEMP, TMP1, TMP2		// TMP2 += temp * TMP1
	st1	TMPV2, [Y_IPTR], INC_Y		// *y = TMP2; y += inc_y
.endm
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
/*
 * SGEMV/DGEMV "N" kernel:  y := alpha * A * x + y  (column-major).
 *
 * In:  M (x0) = rows, N (x1) = cols, alpha (s0/d0), A (x3), LDA (x4),
 *      X (x5) / INC_X (x6), Y (x7) / INC_Y (9th arg, read off the stack).
 * For each column j:  y[] += (alpha * x[j]) * A[:,j].
 * Unit-stride Y takes the vectorized 32/4/1-element path; any other
 * stride falls back to the scalar KERNEL_S1 path.  Returns 0.
 */
	PROLOGUE

	ldr	INC_Y, [sp]			// 9th argument lives on the stack
	SAVE_REGS

	cmp	N, xzr
	ble	gemv_n_kernel_L999		// no columns -> done
	cmp	M, xzr
	ble	gemv_n_kernel_L999		// no rows -> done

	lsl	LDA, LDA, #SHZ			// strides: elements -> bytes
	lsl	INC_X, INC_X, #SHZ
	mov	J, N				// J = columns remaining

	cmp	INC_Y, #1
	bne	gemv_n_kernel_S_BEGIN		// strided-Y path

gemv_n_kernel_F_LOOP:
	ld1	TEMPV, [X], INC_X		// TEMP = x[j]
	fmul	TEMP, ALPHA, TEMP		// TEMP = alpha * x[j]
#if !defined(DOUBLE)
	ins	v1.s[1], v1.s[0]		// broadcast TEMP to all lanes of v1
	ins	v1.s[2], v1.s[0]
	ins	v1.s[3], v1.s[0]
#else
	ins	v1.d[1], v1.d[0]
#endif
	mov	A_PTR, A
	mov	Y_IPTR, Y
	mov	Y_OPTR, Y

gemv_n_kernel_F32:
	asr	I, M, #5			// 32-element blocks
	cmp	I, xzr
	beq	gemv_n_kernel_F4

gemv_n_kernel_F320:
	KERNEL_F16				// 2 x 16 = 32 elements/iteration
	KERNEL_F16
	subs	I, I, #1
	bne	gemv_n_kernel_F320

gemv_n_kernel_F4:
	ands	I, M, #31			// 4-element blocks of the tail
	asr	I, I, #2
	cmp	I, xzr
	beq	gemv_n_kernel_F1

gemv_n_kernel_F40:
	KERNEL_F4
	subs	I, I, #1
	bne	gemv_n_kernel_F40

gemv_n_kernel_F1:
	ands	I, M, #3			// scalar tail (0..3 elements)
	ble	gemv_n_kernel_F_END

gemv_n_kernel_F10:
	KERNEL_F1
	subs	I, I, #1
	bne	gemv_n_kernel_F10

gemv_n_kernel_F_END:
	add	A, A, LDA			// next column of A
	subs	J, J, #1
	bne	gemv_n_kernel_F_LOOP
	b	gemv_n_kernel_L999

gemv_n_kernel_S_BEGIN:
	INIT_S					// INC_Y: elements -> bytes

gemv_n_kernel_S_LOOP:
	ld1	TEMPV, [X], INC_X		// TEMP = alpha * x[j]
	fmul	TEMP, ALPHA, TEMP
	mov	A_PTR, A
	mov	Y_IPTR, Y
	asr	I, M, #2			// scalar loop unrolled by 4
	cmp	I, xzr
	ble	gemv_n_kernel_S1

gemv_n_kernel_S4:
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	gemv_n_kernel_S4

gemv_n_kernel_S1:
	ands	I, M, #3			// remaining 0..3 elements
	ble	gemv_n_kernel_S_END

gemv_n_kernel_S10:
	KERNEL_S1
	subs	I, I, #1
	bne	gemv_n_kernel_S10

gemv_n_kernel_S_END:
	add	A, A, LDA			// next column of A
	subs	J, J, #1
	bne	gemv_n_kernel_S_LOOP

gemv_n_kernel_L999:
	mov	w0, wzr				// return 0
	RESTORE_REGS
	ret

	EPILOGUE
| @@ -0,0 +1,347 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 /* Y vector length */ | |||
| #define N x1 /* X vector length */ | |||
| #define A x3 /* A vector address */ | |||
| #define LDA x4 /* A stride */ | |||
| #define X x5 /* X vector address */ | |||
| #define INC_X x6 /* X stride */ | |||
| #define Y x7 /* Y vector address */ | |||
| #define INC_Y x2 /* Y stride */ | |||
| #define A_PTR x9 /* loop A vector address */ | |||
| #define X_PTR x10 /* loop X vector address */ | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define REG0 wzr | |||
| #define ALPHA s0 | |||
| #define TEMP s1 | |||
| #define TEMP1 s2 | |||
| #define TEMP2 s3 | |||
| #define TEMP3 s4 | |||
| #define TEMPV {v1.s}[0] | |||
| #define TMP1 s2 | |||
| #define TMPV1 {v2.s}[0] | |||
| #define TMP2 s3 | |||
| #define TMPV2 {v3.s}[0] | |||
| #define SZ 4 | |||
| #define SHZ 2 | |||
| #else | |||
| #define REG0 xzr | |||
| #define ALPHA d0 | |||
| #define TEMP d1 | |||
| #define TEMP1 d2 | |||
| #define TEMP2 d3 | |||
| #define TEMP3 d4 | |||
| #define TEMPV {v1.d}[0] | |||
| #define TMP1 d2 | |||
| #define TMPV1 {v2.d}[0] | |||
| #define TMP2 d3 | |||
| #define TMPV2 {v3.d}[0] | |||
| #define SZ 8 | |||
| #define SHZ 3 | |||
| #endif | |||
| /******************************************************************************/ | |||
/*
 * SAVE_REGS / RESTORE_REGS - prologue/epilogue spill pair.
 *
 * This kernel clobbers callee-saved state only in v8-v15 (AAPCS64
 * preserves just their low 64 bits); the GPRs used (x0-x12) are all
 * volatile.  The previous version also spilled caller-saved d16/d17,
 * the unused x19-x28, and loaded/stored the platform-reserved x18
 * (off-limits to general code under AAPCS64).  Only d8-d15 are saved
 * now.  Keep both macros in lock-step: same frame size, same layout.
 */
.macro SAVE_REGS
	sub	sp, sp, #(4 * 16)
	stp	d8,  d9,  [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
.endm

.macro RESTORE_REGS
	ldp	d8,  d9,  [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	add	sp, sp, #(4 * 16)
.endm
/*
 * KERNEL_F32 - 32-element partial dot product for the transposed GEMV:
 * accumulates a[i] * x[i] into the four vector accumulators v1..v4
 * (folded together later by KERNEL_F32_FINALIZE).  Advances A_PTR
 * and X_PTR by 32 elements.
 */
.macro KERNEL_F32
#if !defined(DOUBLE)
	ld1	{v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64	// a[0..15]
	ld1	{v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64	// x[0..15]
	fmla	v1.4s, v5.4s, v9.4s
	fmla	v2.4s, v6.4s, v10.4s
	fmla	v3.4s, v7.4s, v11.4s
	fmla	v4.4s, v8.4s, v12.4s
	ld1	{v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64	// a[16..31]
	ld1	{v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64	// x[16..31]
	fmla	v1.4s, v13.4s, v17.4s
	fmla	v2.4s, v14.4s, v18.4s
	fmla	v3.4s, v15.4s, v19.4s
	fmla	v4.4s, v16.4s, v20.4s
#else	// DOUBLE: four 8-element rounds
	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64	// a[0..7]
	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64	// x[0..7]
	fmla	v1.2d, v5.2d, v9.2d
	fmla	v2.2d, v6.2d, v10.2d
	fmla	v3.2d, v7.2d, v11.2d
	fmla	v4.2d, v8.2d, v12.2d
	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64	// a[8..15]
	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64	// x[8..15]
	fmla	v1.2d, v13.2d, v17.2d
	fmla	v2.2d, v14.2d, v18.2d
	fmla	v3.2d, v15.2d, v19.2d
	fmla	v4.2d, v16.2d, v20.2d
	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64	// a[16..23]
	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64	// x[16..23]
	fmla	v1.2d, v5.2d, v9.2d
	fmla	v2.2d, v6.2d, v10.2d
	fmla	v3.2d, v7.2d, v11.2d
	fmla	v4.2d, v8.2d, v12.2d
	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64	// a[24..31]
	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64	// x[24..31]
	fmla	v1.2d, v13.2d, v17.2d
	fmla	v2.2d, v14.2d, v18.2d
	fmla	v3.2d, v15.2d, v19.2d
	fmla	v4.2d, v16.2d, v20.2d
#endif
.endm
/* KERNEL_F32_FINALIZE - fold the four block accumulators into v1. */
.macro KERNEL_F32_FINALIZE
#if !defined(DOUBLE)
	fadd	v1.4s, v1.4s, v2.4s
	fadd	v1.4s, v1.4s, v3.4s
	fadd	v1.4s, v1.4s, v4.4s
#else
	fadd	v1.2d, v1.2d, v2.2d
	fadd	v1.2d, v1.2d, v3.2d
	fadd	v1.2d, v1.2d, v4.2d
#endif
.endm
/*
 * KERNEL_F4 - 4-element partial dot product accumulated into v1;
 * A_PTR and X_PTR advance by 4 elements.
 */
.macro KERNEL_F4
#if !defined(DOUBLE)
	ld1	{v2.4s}, [A_PTR], #16
	ld1	{v3.4s}, [X_PTR], #16
	fmla	v1.4s, v2.4s, v3.4s
#else	// DOUBLE: two 2-lane rounds
	ld1	{v2.2d}, [A_PTR], #16
	ld1	{v3.2d}, [X_PTR], #16
	fmla	v1.2d, v2.2d, v3.2d
	ld1	{v4.2d}, [A_PTR], #16
	ld1	{v5.2d}, [X_PTR], #16
	fmla	v1.2d, v4.2d, v5.2d
#endif
.endm
/*
 * KERNEL_F4_FINALIZE - horizontal-sum the lanes of v1 into the scalar
 * accumulator TEMP (lane 0 of v1), so the scalar tail can continue.
 */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
	ext	v2.16b, v1.16b, v1.16b, #8	// rotate high pair down
	fadd	v1.2s, v1.2s, v2.2s		// pairwise: lanes 0+2, 1+3
	faddp	TEMP, v1.2s			// TEMP = lane0 + lane1
#else
	faddp	TEMP, v1.2d			// TEMP = lane0 + lane1
#endif
.endm
/* KERNEL_F1 - scalar dot-product step: TEMP += (*a++) * (*x++). */
.macro KERNEL_F1
	ld1	TMPV1, [A_PTR], #SZ
	ld1	TMPV2, [X_PTR], #SZ
	fmadd	TEMP, TMP1, TMP2, TEMP
.endm
/* INIT_S - convert the X stride from elements to bytes for the strided path. */
.macro INIT_S
	lsl	INC_X, INC_X, #SHZ
.endm
/* KERNEL_S1 - scalar step for strided X: TEMP += (*a++) * (*x); x += inc_x. */
.macro KERNEL_S1
	ld1	TMPV1, [A_PTR], #SZ
	ld1	TMPV2, [X_PTR], INC_X
	fmadd	TEMP, TMP1, TMP2, TEMP
.endm
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
/*
 * SGEMV/DGEMV "T" kernel:  y[j] := alpha * dot(A[:,j], x) + y[j].
 *
 * In:  M (x0) = rows, N (x1) = cols, alpha (s0/d0), A (x3), LDA (x4),
 *      X (x5) / INC_X (x6), Y (x7) / INC_Y (9th arg, read off the stack).
 * Unit-stride X uses the vectorized 32/4/1 reduction tiers; any other
 * stride falls back to scalar KERNEL_S1.  Returns 0.
 */
	PROLOGUE

	ldr	INC_Y, [sp]			// 9th argument lives on the stack
	SAVE_REGS

	cmp	N, xzr
	ble	gemv_t_kernel_L999		// no columns -> done
	cmp	M, xzr
	ble	gemv_t_kernel_L999		// no rows -> done

	lsl	LDA, LDA, #SHZ			// strides: elements -> bytes
	lsl	INC_Y, INC_Y, #SHZ
	mov	J, N				// J = columns remaining

	cmp	INC_X, #1
	bne	gemv_t_kernel_S_BEGIN		// strided-X path

gemv_t_kernel_F_LOOP:
	fmov	TEMP, REG0			// clear all four accumulators
	fmov	TEMP1, REG0			// (fmov to s/d zeroes the whole vector)
	fmov	TEMP2, REG0
	fmov	TEMP3, REG0
	mov	A_PTR, A
	mov	X_PTR, X

gemv_t_kernel_F32:
	asr	I, M, #5			// 32-element blocks
	cmp	I, xzr
	beq	gemv_t_kernel_F4

gemv_t_kernel_F320:
	KERNEL_F32
	subs	I, I, #1
	bne	gemv_t_kernel_F320
	KERNEL_F32_FINALIZE			// v1 += v2 + v3 + v4

gemv_t_kernel_F4:
	ands	I, M, #31			// 4-element blocks of the tail
	asr	I, I, #2
	cmp	I, xzr
	beq	gemv_t_kernel_F1

gemv_t_kernel_F40:
	KERNEL_F4
	subs	I, I, #1
	bne	gemv_t_kernel_F40

gemv_t_kernel_F1:
	KERNEL_F4_FINALIZE			// TEMP = horizontal sum of v1
	ands	I, M, #3			// scalar tail (0..3 elements)
	ble	gemv_t_kernel_F_END

gemv_t_kernel_F10:
	KERNEL_F1
	subs	I, I, #1
	bne	gemv_t_kernel_F10

gemv_t_kernel_F_END:
	ld1	TMPV1, [Y]			// y[j] += alpha * dot
	add	A, A, LDA			// next column of A
	subs	J, J, #1
	fmadd	TMP1, ALPHA, TEMP, TMP1
	st1	TMPV1, [Y], INC_Y
	bne	gemv_t_kernel_F_LOOP
	b	gemv_t_kernel_L999

gemv_t_kernel_S_BEGIN:
	INIT_S					// INC_X: elements -> bytes

gemv_t_kernel_S_LOOP:
	fmov	TEMP, REG0			// clear the scalar accumulator
	mov	A_PTR, A
	mov	X_PTR, X
	asr	I, M, #2			// scalar loop unrolled by 4
	cmp	I, xzr
	ble	gemv_t_kernel_S1

gemv_t_kernel_S4:
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	gemv_t_kernel_S4

gemv_t_kernel_S1:
	ands	I, M, #3			// remaining 0..3 elements
	ble	gemv_t_kernel_S_END

gemv_t_kernel_S10:
	KERNEL_S1
	subs	I, I, #1
	bne	gemv_t_kernel_S10

gemv_t_kernel_S_END:
	ld1	TMPV1, [Y]			// y[j] += alpha * dot
	add	A, A, LDA
	subs	J, J, #1
	fmadd	TMP1, ALPHA, TEMP, TMP1
	st1	TMPV1, [Y], INC_Y
	bne	gemv_t_kernel_S_LOOP

gemv_t_kernel_L999:
	RESTORE_REGS
	mov	w0, wzr				// return 0
	ret

	EPILOGUE
| @@ -0,0 +1,514 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 /* Y vector length */ | |||
| #define N x1 /* X vector length */ | |||
| #define A x3 /* A vector address */ | |||
| #define LDA x4 /* A stride */ | |||
| #define X x5 /* X vector address */ | |||
| #define INC_X x6 /* X stride */ | |||
| #define Y x7 /* Y vector address */ | |||
| #define INC_Y x2 /* Y stride */ | |||
| #define A_PTR x9 /* loop A vector address */ | |||
| #define Y_IPTR x10 /* loop Y vector address */ | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define Y_OPTR x13 /* loop Y vector address */ | |||
| #define X_PTR x14 /* loop X vector address */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define ALPHA_R s0 | |||
| #define ALPHA_I s1 | |||
| #define ALPHA_R_COPY s7 | |||
| #define ALPHA_I_COPY s8 | |||
| #define SHZ 3 | |||
| #else | |||
| #define ALPHA_R d0 | |||
| #define ALPHA_I d1 | |||
| #define ALPHA_R_COPY d7 | |||
| #define ALPHA_I_COPY d8 | |||
| #define SHZ 4 | |||
| #endif | |||
| /******************************************************************************/ | |||
/*
 * SAVE_REGS / RESTORE_REGS - prologue/epilogue spill pair.
 *
 * NOTE(review): under AAPCS64 only v8-v15 (low halves) are callee-saved,
 * so the d16/d17 and x19-x28 spills look unnecessary, and x18 is
 * platform-reserved (general code should not touch it) - worth fixing.
 * The frame size and slot layout are deliberately left unchanged here
 * because the rest of this kernel is outside this view and may depend
 * on the frame; confirm no [sp]-relative access follows SAVE_REGS
 * before trimming.
 */
.macro SAVE_REGS
	add	sp, sp, #-(11 * 16)
	stp	d8,  d9,  [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
	ldp	d8,  d9,  [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11 * 16)
.endm
/*
 * INIT - one-time setup for the complex GEMV "N" kernel.
 * Broadcasts alpha for the vectorized F4 path (v7 = R(alpha) in all
 * lanes, v8 = I(alpha) in all lanes) and prepares the [R, +/-I]
 * multiplier pair in v0/v1 for the scalar F1/S1 path.  Lane order in
 * comments is [high, low].
 */
.macro INIT
	/* F4 path: copy alpha, then broadcast each part. */
	fmov	ALPHA_R_COPY, ALPHA_R
	fmov	ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
	ins	v7.s[1], v7.s[0]		// [R(alpha), R(alpha)]
	ins	v8.s[1], v8.s[0]		// [I(alpha), I(alpha)]
	ins	v7.d[1], v7.d[0]		// replicate into high half
	ins	v8.d[1], v8.d[0]
#else
	ins	v7.d[1], v7.d[0]		// [R(alpha), R(alpha)]
	ins	v8.d[1], v8.d[0]		// [I(alpha), I(alpha)]
#endif
	/* F1/S1 path: v0 = [R, R], v1 = signed imaginary pair. */
#if !defined(DOUBLE)
	ins	v0.s[1], v0.s[0]		// [R(alpha), R(alpha)]
	fneg	s2, ALPHA_I
	ins	v1.s[1], v2.s[0]		// [-I(alpha), I(alpha)]
#if !defined(XCONJ)
	ext	v1.8b, v1.8b, v1.8b, #4		// [I(alpha), -I(alpha)]
#endif
#else
	ins	v0.d[1], v0.d[0]		// [R(alpha), R(alpha)]
	fneg	d2, ALPHA_I
	ins	v1.d[1], v2.d[0]		// [-I(alpha), I(alpha)]
#if !defined(XCONJ)
	ext	v1.16b, v1.16b, v1.16b, #8	// [I(alpha), -I(alpha)]
#endif
#endif
.endm
/*
 * INIT_LOOP - per-column setup: computes temp = alpha * x[j] (with the
 * CONJ/XCONJ sign conventions) in two layouts:
 *   v11/v12 - R(temp)/I(temp) broadcast for the vectorized F4 path;
 *   v2/v3   - signed [R, R] / [+/-I, +/-I] pairs for the F1/S1 path.
 * Comments give the contribution each FMA adds; signs follow the
 * mnemonic (fmla adds, fmls subtracts).
 */
.macro INIT_LOOP
	/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
	ld1	{v9.2s}, [X_PTR]		// [I(X), R(X)]
	ins	v10.s[0], v9.s[1]
	ins	v9.s[1], v9.s[0]		// [R(X), R(X)]
	ins	v10.s[1], v10.s[0]		// [I(X), I(X)]
	ins	v9.d[1], v9.d[0]		// broadcast to all 4 lanes
	ins	v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
	fmul	v11.4s, v9.4s, v7.4s		// [+ R(X) * R(ALPHA)]
	fmls	v11.4s, v10.4s, v8.4s		// [- I(X) * I(ALPHA)]
	fmul	v12.4s, v9.4s, v8.4s		// [+ R(X) * I(ALPHA)]
	fmla	v12.4s, v10.4s, v7.4s		// [+ I(X) * R(ALPHA)]
#else
	fmul	v11.4s, v9.4s, v7.4s		// [+ R(X) * R(ALPHA)]
	fmla	v11.4s, v10.4s, v8.4s		// [+ I(X) * I(ALPHA)]
	fmul	v12.4s, v9.4s, v8.4s		// [+ R(X) * I(ALPHA)]
	fmls	v12.4s, v10.4s, v7.4s		// [- I(X) * R(ALPHA)]
#endif
#else	// CONJ
#if !defined(XCONJ)
	fmul	v11.4s, v9.4s, v7.4s		// [+ R(X) * R(ALPHA)]
	fmls	v11.4s, v10.4s, v8.4s		// [- I(X) * I(ALPHA)]
	fmul	v12.4s, v10.4s, v7.4s		// [+ I(X) * R(ALPHA)]
	fmls	v12.4s, v9.4s, v8.4s		// [- R(X) * I(ALPHA)]
#else
	fmul	v11.4s, v9.4s, v7.4s		// [+ R(X) * R(ALPHA)]
	fmls	v11.4s, v10.4s, v8.4s		// [- I(X) * I(ALPHA)]
	fmul	v12.4s, v9.4s, v8.4s		// [R(X) * I(ALPHA)]
	fneg	v12.4s, v12.4s			// [- R(X) * I(ALPHA)]
	fmla	v12.4s, v10.4s, v7.4s		// [+ I(X) * R(ALPHA)]
#endif
#endif	// CONJ
	/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
	ld1	{v2.2s}, [X_PTR]		// [I(X), R(X)]
	ext	v3.8b, v2.8b, v2.8b, #4		// [R(X), I(X)]
	fmul	v2.2s, v0.2s, v2.2s
	fmla	v2.2s, v1.2s, v3.2s		// [I(TEMP), R(TEMP)]
	ins	v3.s[0], v2.s[1]		// I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
	fneg	s4, s3				// -I(TEMP)
	ins	v3.s[1], v4.s[0]
	ext	v3.8b, v3.8b, v3.8b, #4		// [I(TEMP), -I(TEMP)]
	ins	v2.s[1], v2.s[0]		// [R(TEMP), R(TEMP)]
#else
	fneg	s4, s3				// -I(TEMP)
	ins	v3.s[1], v4.s[0]		// [-I(TEMP), I(TEMP)]
	ins	v2.s[1], v2.s[0]		// [R(TEMP), R(TEMP)]
#endif
#else	// CONJ
#if !defined(XCONJ)
	ins	v3.s[1], v3.s[0]		// [I(TEMP), I(TEMP)]
	fneg	s4, s2				// -R(TEMP)
	ins	v2.s[1], v4.s[0]		// [-R(TEMP), R(TEMP)]
#else
	fneg	s3, s3				// -I(TEMP)
	ins	v3.s[1], v3.s[0]		// [-I(TEMP), -I(TEMP)]
	fneg	s4, s2				// -R(TEMP)
	ins	v2.s[1], v4.s[0]		// [-R(TEMP), R(TEMP)]
#endif
#endif	// CONJ
#else	// DOUBLE
	/********** INIT_LOOP FOR F4 LOOP **********/
	ld1	{v9.2d}, [X_PTR]		// [I(X), R(X)]
	ins	v10.d[0], v9.d[1]
	ins	v9.d[1], v9.d[0]		// [R(X), R(X)]
	ins	v10.d[1], v10.d[0]		// [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
	fmul	v11.2d, v9.2d, v7.2d		// [+ R(X) * R(ALPHA)]
	fmls	v11.2d, v10.2d, v8.2d		// [- I(X) * I(ALPHA)]
	fmul	v12.2d, v9.2d, v8.2d		// [+ R(X) * I(ALPHA)]
	fmla	v12.2d, v10.2d, v7.2d		// [+ I(X) * R(ALPHA)]
#else
	fmul	v11.2d, v9.2d, v7.2d		// [+ R(X) * R(ALPHA)]
	fmla	v11.2d, v10.2d, v8.2d		// [+ I(X) * I(ALPHA)]
	fmul	v12.2d, v9.2d, v8.2d		// [+ R(X) * I(ALPHA)]
	fmls	v12.2d, v10.2d, v7.2d		// [- I(X) * R(ALPHA)]
#endif
#else	// CONJ
#if !defined(XCONJ)
	fmul	v11.2d, v9.2d, v7.2d		// [+ R(X) * R(ALPHA)]
	fmls	v11.2d, v10.2d, v8.2d		// [- I(X) * I(ALPHA)]
	fmul	v12.2d, v10.2d, v7.2d		// [+ I(X) * R(ALPHA)]
	fmls	v12.2d, v9.2d, v8.2d		// [- R(X) * I(ALPHA)]
#else
	fmul	v11.2d, v9.2d, v7.2d		// [+ R(X) * R(ALPHA)]
	fmls	v11.2d, v10.2d, v8.2d		// [- I(X) * I(ALPHA)]
	fmul	v12.2d, v9.2d, v8.2d		// [R(X) * I(ALPHA)]
	fneg	v12.2d, v12.2d			// [- R(X) * I(ALPHA)]
	fmla	v12.2d, v10.2d, v7.2d		// [+ I(X) * R(ALPHA)]
#endif
#endif	// CONJ
	/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
	ld1	{v2.2d}, [X_PTR]		// [I(X), R(X)]
	ext	v3.16b, v2.16b, v2.16b, #8	// [R(X), I(X)]
	fmul	v2.2d, v0.2d, v2.2d
	fmla	v2.2d, v1.2d, v3.2d		// [I(TEMP), R(TEMP)]
	ins	v3.d[0], v2.d[1]		// I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
	fneg	d4, d3				// -I(TEMP)
	ins	v3.d[1], v4.d[0]
	ext	v3.16b, v3.16b, v3.16b, #8	// [I(TEMP), -I(TEMP)]
	ins	v2.d[1], v2.d[0]		// [R(TEMP), R(TEMP)]
#else
	fneg	d4, d3				// -I(TEMP)
	ins	v3.d[1], v4.d[0]		// [-I(TEMP), I(TEMP)]
	ins	v2.d[1], v2.d[0]		// [R(TEMP), R(TEMP)]
#endif
#else	// CONJ
#if !defined(XCONJ)
	ins	v3.d[1], v3.d[0]		// [I(TEMP), I(TEMP)]
	fneg	d4, d2				// -R(TEMP)
	ins	v2.d[1], v4.d[0]		// [-R(TEMP), R(TEMP)]
#else
	fneg	d3, d3				// -I(TEMP)
	ins	v3.d[1], v3.d[0]		// [-I(TEMP), -I(TEMP)]
	fneg	d4, d2				// -R(TEMP)
	ins	v2.d[1], v4.d[0]		// [-R(TEMP), R(TEMP)]
#endif
#endif	// CONJ
#endif	// DOUBLE
.endm
/*
 * KERNEL_F4 - complex AXPY over 4 complex elements for unit-stride Y:
 *   y[i] += temp * a[i]  (conjugation per CONJ/XCONJ),
 * with R(temp)/I(temp) broadcast in v11/v12.  ld2/st2 deinterleave
 * real (v13/v15/...) and imaginary (v14/v16/...) parts.
 * Comment signs follow the mnemonic (fmla adds, fmls subtracts);
 * several of the original comments had the wrong sign.
 */
.macro KERNEL_F4
#if !defined(DOUBLE)
	ld2	{v13.4s, v14.4s}, [A_PTR], #32	// A_R / A_I
	ld2	{v15.4s, v16.4s}, [Y_IPTR], #32	// Y_R / Y_I
#if !defined(CONJ)
#if !defined(XCONJ)
	fmla	v15.4s, v11.4s, v13.4s		// [+ R(ALPHA * X) * A_R]
	fmls	v15.4s, v12.4s, v14.4s		// [- I(ALPHA * X) * A_I]
	fmla	v16.4s, v11.4s, v14.4s		// [+ R(ALPHA * X) * A_I]
	fmla	v16.4s, v12.4s, v13.4s		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v15.4s, v11.4s, v13.4s		// [+ R(ALPHA * X) * A_R]
	fmla	v15.4s, v12.4s, v14.4s		// [+ I(ALPHA * X) * A_I]
	fmla	v16.4s, v11.4s, v14.4s		// [+ R(ALPHA * X) * A_I]
	fmls	v16.4s, v12.4s, v13.4s		// [- I(ALPHA * X) * A_R]
#endif
#else	// CONJ
#if !defined(XCONJ)
	fmla	v15.4s, v11.4s, v13.4s		// [+ R(ALPHA * X) * A_R]
	fmla	v15.4s, v12.4s, v14.4s		// [+ I(ALPHA * X) * A_I]
	fmls	v16.4s, v11.4s, v14.4s		// [- R(ALPHA * X) * A_I]
	fmla	v16.4s, v12.4s, v13.4s		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v15.4s, v11.4s, v13.4s		// [+ R(ALPHA * X) * A_R]
	fmls	v15.4s, v12.4s, v14.4s		// [- I(ALPHA * X) * A_I]
	fmls	v16.4s, v11.4s, v14.4s		// [- R(ALPHA * X) * A_I]
	fmls	v16.4s, v12.4s, v13.4s		// [- I(ALPHA * X) * A_R]
#endif
#endif	// CONJ
	st2	{v15.4s, v16.4s}, [Y_OPTR], #32
#else	// DOUBLE: two 2-element rounds
	ld2	{v13.2d, v14.2d}, [A_PTR], #32	// A_R / A_I
	ld2	{v15.2d, v16.2d}, [Y_IPTR], #32	// Y_R / Y_I
#if !defined(CONJ)
#if !defined(XCONJ)
	fmla	v15.2d, v11.2d, v13.2d		// [+ R(ALPHA * X) * A_R]
	fmls	v15.2d, v12.2d, v14.2d		// [- I(ALPHA * X) * A_I]
	fmla	v16.2d, v11.2d, v14.2d		// [+ R(ALPHA * X) * A_I]
	fmla	v16.2d, v12.2d, v13.2d		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v15.2d, v11.2d, v13.2d		// [+ R(ALPHA * X) * A_R]
	fmla	v15.2d, v12.2d, v14.2d		// [+ I(ALPHA * X) * A_I]
	fmla	v16.2d, v11.2d, v14.2d		// [+ R(ALPHA * X) * A_I]
	fmls	v16.2d, v12.2d, v13.2d		// [- I(ALPHA * X) * A_R]
#endif
#else	// CONJ
#if !defined(XCONJ)
	fmla	v15.2d, v11.2d, v13.2d		// [+ R(ALPHA * X) * A_R]
	fmla	v15.2d, v12.2d, v14.2d		// [+ I(ALPHA * X) * A_I]
	fmls	v16.2d, v11.2d, v14.2d		// [- R(ALPHA * X) * A_I]
	fmla	v16.2d, v12.2d, v13.2d		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v15.2d, v11.2d, v13.2d		// [+ R(ALPHA * X) * A_R]
	fmls	v15.2d, v12.2d, v14.2d		// [- I(ALPHA * X) * A_I]
	fmls	v16.2d, v11.2d, v14.2d		// [- R(ALPHA * X) * A_I]
	fmls	v16.2d, v12.2d, v13.2d		// [- I(ALPHA * X) * A_R]
#endif
#endif	// CONJ
	st2	{v15.2d, v16.2d}, [Y_OPTR], #32
	ld2	{v17.2d, v18.2d}, [A_PTR], #32	// next A_R / A_I pair
	ld2	{v19.2d, v20.2d}, [Y_IPTR], #32	// next Y_R / Y_I pair
#if !defined(CONJ)
#if !defined(XCONJ)
	fmla	v19.2d, v11.2d, v17.2d		// [+ R(ALPHA * X) * A_R]
	fmls	v19.2d, v12.2d, v18.2d		// [- I(ALPHA * X) * A_I]
	fmla	v20.2d, v11.2d, v18.2d		// [+ R(ALPHA * X) * A_I]
	fmla	v20.2d, v12.2d, v17.2d		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v19.2d, v11.2d, v17.2d		// [+ R(ALPHA * X) * A_R]
	fmla	v19.2d, v12.2d, v18.2d		// [+ I(ALPHA * X) * A_I]
	fmla	v20.2d, v11.2d, v18.2d		// [+ R(ALPHA * X) * A_I]
	fmls	v20.2d, v12.2d, v17.2d		// [- I(ALPHA * X) * A_R]
#endif
#else	// CONJ
#if !defined(XCONJ)
	fmla	v19.2d, v11.2d, v17.2d		// [+ R(ALPHA * X) * A_R]
	fmla	v19.2d, v12.2d, v18.2d		// [+ I(ALPHA * X) * A_I]
	fmls	v20.2d, v11.2d, v18.2d		// [- R(ALPHA * X) * A_I]
	fmla	v20.2d, v12.2d, v17.2d		// [+ I(ALPHA * X) * A_R]
#else
	fmla	v19.2d, v11.2d, v17.2d		// [+ R(ALPHA * X) * A_R]
	fmls	v19.2d, v12.2d, v18.2d		// [- I(ALPHA * X) * A_I]
	fmls	v20.2d, v11.2d, v18.2d		// [- R(ALPHA * X) * A_I]
	fmls	v20.2d, v12.2d, v17.2d		// [- I(ALPHA * X) * A_R]
#endif
#endif	// CONJ
	st2	{v19.2d, v20.2d}, [Y_OPTR], #32
#endif
.endm
| .macro KERNEL_F1 | |||
| // One complex element of the unit-stride path: y[i] += (alpha*x_j) * A[i]. | |||
| // v2/v3 hold the real/imaginary multiplier pairs prepared per column by | |||
| // INIT_LOOP (defined above this chunk — assumed; verify against that macro). | |||
| // The ext swaps the real/imaginary lanes of A so two fmla ops complete the | |||
| // complex multiply-accumulate. | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [A_PTR], #8 // one complex single: [A_R, A_I] | |||
| ld1 {v5.2s}, [Y_IPTR], #8 // current y element | |||
| ext v6.8b, v4.8b, v4.8b, #4 // lane-swapped A: [A_I, A_R] | |||
| fmla v5.2s, v2.2s, v4.2s | |||
| fmla v5.2s, v3.2s, v6.2s | |||
| st1 {v5.2s}, [Y_OPTR], #8 | |||
| #else // DOUBLE | |||
| ld1 {v4.2d}, [A_PTR], #16 // one complex double: [A_R, A_I] | |||
| ld1 {v5.2d}, [Y_IPTR], #16 // current y element | |||
| ext v6.16b, v4.16b, v4.16b, #8 // lane-swapped A: [A_I, A_R] | |||
| fmla v5.2d, v2.2d, v4.2d | |||
| fmla v5.2d, v3.2d, v6.2d | |||
| st1 {v5.2d}, [Y_OPTR], #16 | |||
| #endif | |||
| .endm | |||
| .macro INIT_S | |||
| // Strided-y setup: convert INC_Y from elements to bytes | |||
| // (SHZ = log2 of the complex element size: 3 for single, 4 for double). | |||
| lsl INC_Y, INC_Y, #SHZ | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| // Same single-element update as KERNEL_F1, but y is read/written with the | |||
| // byte stride INC_Y (post-indexed register offset) instead of unit stride. | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [A_PTR], #8 // one complex single: [A_R, A_I] | |||
| ld1 {v5.2s}, [Y_IPTR], INC_Y | |||
| ext v6.8b, v4.8b, v4.8b, #4 // lane-swapped A: [A_I, A_R] | |||
| fmla v5.2s, v2.2s, v4.2s | |||
| fmla v5.2s, v3.2s, v6.2s | |||
| st1 {v5.2s}, [Y_OPTR], INC_Y | |||
| #else // DOUBLE | |||
| ld1 {v4.2d}, [A_PTR], #16 // one complex double: [A_R, A_I] | |||
| ld1 {v5.2d}, [Y_IPTR], INC_Y | |||
| ext v6.16b, v4.16b, v4.16b, #8 // lane-swapped A: [A_I, A_R] | |||
| fmla v5.2d, v2.2d, v4.2d | |||
| fmla v5.2d, v3.2d, v6.2d | |||
| st1 {v5.2d}, [Y_OPTR], INC_Y | |||
| #endif | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| // zgemv_n entry: y := y + alpha * A * x for column-major A. | |||
| // The outer J loop walks the N columns of A (one x element each); the inner | |||
| // loops walk the M complex elements of the column, unrolled by 4 with a | |||
| // scalar remainder. Separate fast path for INC_Y == 1. | |||
| PROLOGUE | |||
| ldr INC_Y, [sp] // 9th integer argument (incy) arrives on the stack | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble zgemv_n_kernel_L999 // nothing to do for N <= 0 | |||
| cmp M, xzr | |||
| ble zgemv_n_kernel_L999 // nothing to do for M <= 0 | |||
| lsl LDA, LDA, #SHZ // element strides -> byte strides | |||
| lsl INC_X, INC_X, #SHZ | |||
| mov J, N // J = remaining columns | |||
| INIT | |||
| cmp INC_Y, #1 | |||
| bne zgemv_n_kernel_S_BEGIN // non-unit y stride: strided path | |||
| zgemv_n_kernel_F_LOOP: // --- unit-stride-y column loop --- | |||
| mov A_PTR, A | |||
| mov Y_IPTR, Y | |||
| mov Y_OPTR, Y | |||
| mov X_PTR, X | |||
| add X, X, INC_X // advance x to the next element (byte stride) | |||
| INIT_LOOP | |||
| asr I, M, #2 // I = M / 4 unrolled iterations | |||
| cmp I, xzr | |||
| beq zgemv_n_kernel_F1 | |||
| zgemv_n_kernel_F4: // 4 elements per iteration | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F4 | |||
| zgemv_n_kernel_F1: | |||
| ands I, M, #3 // remainder: M mod 4 elements | |||
| ble zgemv_n_kernel_F_END | |||
| zgemv_n_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F10 | |||
| zgemv_n_kernel_F_END: | |||
| add A, A, LDA // next column of A | |||
| subs J, J, #1 | |||
| bne zgemv_n_kernel_F_LOOP | |||
| b zgemv_n_kernel_L999 | |||
| zgemv_n_kernel_S_BEGIN: // --- strided-y path (same structure) --- | |||
| INIT_S | |||
| zgemv_n_kernel_S_LOOP: | |||
| mov A_PTR, A | |||
| mov Y_IPTR, Y | |||
| mov Y_OPTR, Y | |||
| mov X_PTR, X | |||
| add X, X, INC_X | |||
| INIT_LOOP | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble zgemv_n_kernel_S1 | |||
| zgemv_n_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_S4 | |||
| zgemv_n_kernel_S1: | |||
| ands I, M, #3 | |||
| ble zgemv_n_kernel_S_END | |||
| zgemv_n_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_S10 | |||
| zgemv_n_kernel_S_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_n_kernel_S_LOOP | |||
| zgemv_n_kernel_L999: | |||
| RESTORE_REGS | |||
| mov w0, wzr // return 0 | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,448 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 /* X vector length (rows of A; inner-loop count in this transposed kernel) */ | |||
| #define N x1 /* Y vector length (columns of A; outer-loop count) */ | |||
| #define A x3 /* A vector address */ | |||
| #define LDA x4 /* A stride */ | |||
| #define X x5 /* X vector address */ | |||
| #define INC_X x6 /* X stride */ | |||
| #define Y x7 /* Y vector address */ | |||
| #define INC_Y x2 /* Y stride */ | |||
| #define A_PTR x9 /* loop A vector address */ | |||
| #define X_PTR x10 /* loop X vector address */ | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define ALPHA_R s0 | |||
| #define ALPHA_I s1 | |||
| #define ALPHA_R_COPY s7 | |||
| #define ALPHA_I_COPY s8 | |||
| #define SHZ 3 | |||
| #else | |||
| #define ALPHA_R d0 | |||
| #define ALPHA_I d1 | |||
| #define ALPHA_R_COPY d7 | |||
| #define ALPHA_I_COPY d8 | |||
| #define SHZ 4 | |||
| #endif | |||
| /******************************************************************************/ | |||
| .macro SAVE_REGS | |||
| // Spill callee-saved state (AAPCS64: x19-x28 and the low halves d8-d15). | |||
| // d16/d17 and x18 are also saved for symmetry with RESTORE_REGS, although | |||
| // x18 is platform-reserved on some OSes — NOTE(review): it is saved/restored | |||
| // here but this file does not visibly use it. 176 bytes keeps sp 16-aligned. | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| // Reload everything spilled by SAVE_REGS (same layout) and pop the frame. | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro INIT | |||
| // Build the alpha multiplier vectors used by the final y update: | |||
| // v0 carries the real-part factors, v1 the imaginary-part factors, with the | |||
| // sign pattern chosen per XCONJ so two plain fmla ops perform the complex | |||
| // multiply y += alpha * TEMP. v2 is scratch. | |||
| #if !defined(XCONJ) | |||
| #if !defined(DOUBLE) | |||
| ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | |||
| fneg s2, ALPHA_I | |||
| ins v1.s[1], v2.s[0] | |||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | |||
| #else | |||
| ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | |||
| fneg d2, ALPHA_I | |||
| ins v1.d[1], v2.d[0] | |||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | |||
| #endif | |||
| #else // XCONJ | |||
| #if !defined(DOUBLE) | |||
| fneg s2, ALPHA_R | |||
| ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | |||
| ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | |||
| #else | |||
| fneg d2, ALPHA_R | |||
| ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | |||
| ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | |||
| #endif | |||
| #endif | |||
| .endm | |||
| .macro INIT_LOOP | |||
| // Zero the per-column dot-product accumulators. Writing a d-register | |||
| // clears the full 128-bit vector register, so the .4s/.2d lanes start at 0. | |||
| fmov d9, xzr // TEMP_R = [0, 0] | |||
| fmov d10, xzr // TEMP_I = [0, 0] | |||
| #if !defined(DOUBLE) | |||
| #else // second accumulator pair, used only by the double-precision KERNEL_F4 | |||
| fmov d15, xzr // TEMP_R = [0, 0] | |||
| fmov d16, xzr // TEMP_I = [0, 0] | |||
| #endif | |||
| fmov d2, xzr // TEMP = [0, 0] (scalar-remainder accumulator) | |||
| .endm | |||
| .macro KERNEL_F4 | |||
| // Accumulate 4 complex products of the column dot product TEMP += x[i]*A[i] | |||
| // into (v9, v10) = (real, imag) lanes; the double path unrolls twice and also | |||
| // uses (v15, v16). CONJ/XCONJ select the sign pattern of the four terms. | |||
| // NOTE(review): several sign comments in the second double-precision half | |||
| // were corrected to match the fmla/fmls opcodes (which agree with the first | |||
| // half). | |||
| #if !defined(DOUBLE) | |||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 // de-interleave 4 x: v11=R(X), v12=I(X) | |||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 // de-interleave 4 A: v13=A_R, v14=A_I | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||
| #else | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||
| #else | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| #else // DOUBLE | |||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 // 2 x: v11=R(X), v12=I(X) | |||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 // 2 A: v13=A_R, v14=A_I | |||
| prfm PLDL1STRM, [X_PTR, #512] // prefetch upcoming x | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||
| #else | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||
| #else | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 // next 2 x elements | |||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 // next 2 A elements | |||
| prfm PLDL1STRM, [A_PTR, #512] // prefetch upcoming A | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
| #else | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | |||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | |||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | |||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
| #else | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | |||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| #endif //DOUBLE | |||
| .endm | |||
| .macro KERNEL_F4_FINALIZE | |||
| // Horizontally reduce the vector accumulators into v2 = [TEMP_R, TEMP_I], | |||
| // the scalar accumulator that the remainder loop (KERNEL_F1) keeps adding to. | |||
| #if !defined(DOUBLE) | |||
| ext v21.16b, v9.16b, v9.16b, #8 | |||
| fadd v9.2s, v9.2s, v21.2s // fold upper half of v9 onto lower half | |||
| faddp s9, v9.2s // pairwise add -> scalar TEMP_R | |||
| ext v21.16b, v10.16b, v10.16b, #8 | |||
| fadd v10.2s, v10.2s, v21.2s // fold upper half of v10 onto lower half | |||
| faddp s10, v10.2s // pairwise add -> scalar TEMP_I | |||
| ins v2.s[0], v9.s[0] | |||
| ins v2.s[1], v10.s[0] | |||
| #else | |||
| fadd v9.2d, v9.2d, v15.2d // merge the two unrolled accumulator pairs | |||
| fadd v10.2d, v10.2d, v16.2d | |||
| faddp d9, v9.2d // pairwise add -> scalar TEMP_R | |||
| faddp d10, v10.2d // pairwise add -> scalar TEMP_I | |||
| ins v2.d[0], v9.d[0] | |||
| ins v2.d[1], v10.d[0] | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F1 | |||
| // One complex term of the column dot product: TEMP (v2) += x[i] * A[i]. | |||
| // v5 is built as [A_I, -A_I] (or [-A_I, A_I] after the ext, for the | |||
| // non-conjugated combinations) so that two fmla ops against [X_R, X_I] and | |||
| // the lane-swapped [X_I, X_R] realize the signed complex multiply. | |||
| #if !defined(DOUBLE) | |||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||
| ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | |||
| fneg s16, s5 | |||
| ins v5.s[1], v16.s[0] // [-A1, A1] | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||
| #endif | |||
| ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] (lane-swapped x) | |||
| fmla v2.2s, v4.2s, v6.2s | |||
| fmla v2.2s, v5.2s, v7.2s | |||
| #else // DOUBLE | |||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||
| ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | |||
| fneg d16, d5 | |||
| ins v5.d[1], v16.d[0] // [-A1, A1] | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||
| #endif | |||
| ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] (lane-swapped x) | |||
| fmla v2.2d, v4.2d, v6.2d | |||
| fmla v2.2d, v5.2d, v7.2d | |||
| #endif | |||
| .endm | |||
| .macro INIT_S | |||
| // Strided-x setup: convert INC_X from elements to bytes | |||
| // (SHZ = log2 of the complex element size: 3 for single, 4 for double). | |||
| lsl INC_X, INC_X, #SHZ | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| // Same single-term accumulation as KERNEL_F1, but x is read with the byte | |||
| // stride INC_X (post-indexed register offset) instead of unit stride. | |||
| #if !defined(DOUBLE) | |||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||
| ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | |||
| fneg s16, s5 | |||
| ins v5.s[1], v16.s[0] // [-A1, A1] | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||
| #endif | |||
| ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] (lane-swapped x) | |||
| fmla v2.2s, v4.2s, v6.2s | |||
| fmla v2.2s, v5.2s, v7.2s | |||
| #else // DOUBLE | |||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||
| ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | |||
| fneg d16, d5 | |||
| ins v5.d[1], v16.d[0] // [-A1, A1] | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||
| #endif | |||
| ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] (lane-swapped x) | |||
| fmla v2.2d, v4.2d, v6.2d | |||
| fmla v2.2d, v5.2d, v7.2d | |||
| #endif | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| // zgemv_t entry: y := y + alpha * A^T * x (conjugate variants via CONJ/XCONJ). | |||
| // The outer J loop walks the N columns of A; for each column the inner loops | |||
| // compute the complex dot product TEMP = sum_i A[i,j]*x[i] (unrolled by 4 | |||
| // with a scalar remainder), then y[j] += alpha * TEMP. Separate fast path | |||
| // for INC_X == 1. | |||
| PROLOGUE | |||
| ldr INC_Y, [sp] // 9th integer argument (incy) arrives on the stack | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble zgemv_t_kernel_L999 // nothing to do for N <= 0 | |||
| cmp M, xzr | |||
| ble zgemv_t_kernel_L999 // nothing to do for M <= 0 | |||
| lsl LDA, LDA, #SHZ // element strides -> byte strides | |||
| lsl INC_Y, INC_Y, #SHZ | |||
| mov J, N // J = remaining columns | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne zgemv_t_kernel_S_BEGIN // non-unit x stride: strided path | |||
| zgemv_t_kernel_F_LOOP: // --- unit-stride-x column loop --- | |||
| mov A_PTR, A | |||
| mov X_PTR, X | |||
| INIT_LOOP | |||
| asr I, M, #2 // I = M / 4 unrolled iterations | |||
| cmp I, xzr | |||
| beq zgemv_t_kernel_F1 // skip FINALIZE too: v2 is already zero | |||
| zgemv_t_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_F4 | |||
| KERNEL_F4_FINALIZE // reduce vector accumulators into v2 | |||
| zgemv_t_kernel_F1: | |||
| ands I, M, #3 // remainder: M mod 4 elements | |||
| ble zgemv_t_kernel_F_END | |||
| zgemv_t_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_F10 | |||
| zgemv_t_kernel_F_END: | |||
| // y[j] += alpha * TEMP, using the v0/v1 factors prepared by INIT | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [Y] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] | |||
| fmla v4.2s, v0.2s, v2.2s | |||
| fmla v4.2s, v1.2s, v3.2s | |||
| st1 {v4.2s}, [Y], INC_Y | |||
| #else // DOUBLE | |||
| ld1 {v4.2d}, [Y] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] | |||
| fmla v4.2d, v0.2d, v2.2d | |||
| fmla v4.2d, v1.2d, v3.2d | |||
| st1 {v4.2d}, [Y], INC_Y | |||
| #endif | |||
| add A, A, LDA // next column of A | |||
| subs J, J, #1 | |||
| bne zgemv_t_kernel_F_LOOP | |||
| b zgemv_t_kernel_L999 | |||
| zgemv_t_kernel_S_BEGIN: // --- strided-x path (scalar kernel only) --- | |||
| INIT_S | |||
| zgemv_t_kernel_S_LOOP: | |||
| mov A_PTR, A | |||
| mov X_PTR, X | |||
| INIT_LOOP | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble zgemv_t_kernel_S1 | |||
| zgemv_t_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_S4 | |||
| zgemv_t_kernel_S1: | |||
| ands I, M, #3 | |||
| ble zgemv_t_kernel_S_END | |||
| zgemv_t_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_S10 | |||
| zgemv_t_kernel_S_END: | |||
| // y[j] += alpha * TEMP (same update as the unit-stride path) | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [Y] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] | |||
| fmla v4.2s, v0.2s, v2.2s | |||
| fmla v4.2s, v1.2s, v3.2s | |||
| st1 {v4.2s}, [Y], INC_Y | |||
| #else // DOUBLE | |||
| ld1 {v4.2d}, [Y] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] | |||
| fmla v4.2d, v0.2d, v2.2d | |||
| fmla v4.2d, v1.2d, v3.2d | |||
| st1 {v4.2d}, [Y], INC_Y | |||
| #endif | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_t_kernel_S_LOOP | |||
| zgemv_t_kernel_L999: | |||
| RESTORE_REGS | |||
| mov w0, wzr // return 0 | |||
| ret | |||
| EPILOGUE | |||