| @@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | CAMAXKERNEL = zamax.S | ||||
| ZAMAXKERNEL = zamax.S | ZAMAXKERNEL = zamax.S | ||||
| ISAMAXKERNEL = isamax.S | |||||
| IDAMAXKERNEL = idamax.S | |||||
| ISAMAXKERNEL = iamax.S | |||||
| IDAMAXKERNEL = iamax.S | |||||
| ICAMAXKERNEL = izamax.S | ICAMAXKERNEL = izamax.S | ||||
| IZAMAXKERNEL = izamax.S | IZAMAXKERNEL = izamax.S | ||||
| @@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S | |||||
| CCOPYKERNEL = copy.S | CCOPYKERNEL = copy.S | ||||
| ZCOPYKERNEL = copy.S | ZCOPYKERNEL = copy.S | ||||
| DOTKERNEL = dot.S | |||||
| SDOTKERNEL = dot.S | |||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| SNRM2KERNEL = snrm2.S | |||||
| DNRM2KERNEL = dnrm2.S | |||||
| CNRM2KERNEL = znrm2.S | |||||
| ZNRM2KERNEL = znrm2.S | |||||
| #SNRM2KERNEL = snrm2.S | |||||
| #DNRM2KERNEL = dnrm2.S | |||||
| #CNRM2KERNEL = znrm2.S | |||||
| #ZNRM2KERNEL = znrm2.S | |||||
| SROTKERNEL = rot.S | SROTKERNEL = rot.S | ||||
| DROTKERNEL = rot.S | DROTKERNEL = rot.S | ||||
| CROTKERNEL = zrot.S | CROTKERNEL = zrot.S | ||||
| ZROTKERNEL = zrot.S | ZROTKERNEL = zrot.S | ||||
| SCALKERNEL = scal.S | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | DSCALKERNEL = scal.S | ||||
| CSCALKERNEL = zscal.S | CSCALKERNEL = zscal.S | ||||
| ZSCALKERNEL = zscal.S | ZSCALKERNEL = zscal.S | ||||
| @@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmul v16.4s, v0.4s, v8.4s[0] | fmul v16.4s, v0.4s, v8.4s[0] | ||||
| OP_ii v16.4s, v1.4s, v9.4s[0] | OP_ii v16.4s, v1.4s, v9.4s[0] | ||||
| fmul v17.4s, v0.4s, v9.4s[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v17.4s, v17.4s | |||||
| eor v17.16b, v17.16b, v17.16b | |||||
| fmls v17.4s, v0.4s, v9.4s[0] | |||||
| #else | |||||
| fmul v17.4s, v0.4s, v9.4s[0] | |||||
| #endif | #endif | ||||
| OP_ir v17.4s, v1.4s, v8.4s[0] | OP_ir v17.4s, v1.4s, v8.4s[0] | ||||
| fmul v20.4s, v0.4s, v8.4s[1] | fmul v20.4s, v0.4s, v8.4s[1] | ||||
| OP_ii v20.4s, v1.4s, v9.4s[1] | OP_ii v20.4s, v1.4s, v9.4s[1] | ||||
| fmul v21.4s, v0.4s, v9.4s[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v21.4s, v21.4s | |||||
| eor v21.16b, v21.16b, v21.16b | |||||
| fmls v21.4s, v0.4s, v9.4s[1] | |||||
| #else | |||||
| fmul v21.4s, v0.4s, v9.4s[1] | |||||
| #endif | #endif | ||||
| OP_ir v21.4s, v1.4s, v8.4s[1] | OP_ir v21.4s, v1.4s, v8.4s[1] | ||||
| fmul v24.4s, v0.4s, v8.4s[2] | fmul v24.4s, v0.4s, v8.4s[2] | ||||
| OP_ii v24.4s, v1.4s, v9.4s[2] | OP_ii v24.4s, v1.4s, v9.4s[2] | ||||
| fmul v25.4s, v0.4s, v9.4s[2] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v25.4s, v25.4s | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fmls v25.4s, v0.4s, v9.4s[2] | |||||
| #else | |||||
| fmul v25.4s, v0.4s, v9.4s[2] | |||||
| #endif | #endif | ||||
| OP_ir v25.4s, v1.4s, v8.4s[2] | OP_ir v25.4s, v1.4s, v8.4s[2] | ||||
| fmul v28.4s, v0.4s, v8.4s[3] | fmul v28.4s, v0.4s, v8.4s[3] | ||||
| OP_ii v28.4s, v1.4s, v9.4s[3] | OP_ii v28.4s, v1.4s, v9.4s[3] | ||||
| fmul v29.4s, v0.4s, v9.4s[3] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v29.4s, v29.4s | |||||
| eor v29.16b, v29.16b, v29.16b | |||||
| fmls v29.4s, v0.4s, v9.4s[3] | |||||
| #else | |||||
| fmul v29.4s, v0.4s, v9.4s[3] | |||||
| #endif | #endif | ||||
| OP_ir v29.4s, v1.4s, v8.4s[3] | OP_ir v29.4s, v1.4s, v8.4s[3] | ||||
| fmul v18.4s, v2.4s, v8.4s[0] | fmul v18.4s, v2.4s, v8.4s[0] | ||||
| OP_ii v18.4s, v3.4s, v9.4s[0] | OP_ii v18.4s, v3.4s, v9.4s[0] | ||||
| fmul v19.4s, v2.4s, v9.4s[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v19.4s, v19.4s | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.4s, v2.4s, v9.4s[0] | |||||
| #else | |||||
| fmul v19.4s, v2.4s, v9.4s[0] | |||||
| #endif | #endif | ||||
| OP_ir v19.4s, v3.4s, v8.4s[0] | OP_ir v19.4s, v3.4s, v8.4s[0] | ||||
| fmul v22.4s, v2.4s, v8.4s[1] | fmul v22.4s, v2.4s, v8.4s[1] | ||||
| OP_ii v22.4s, v3.4s, v9.4s[1] | OP_ii v22.4s, v3.4s, v9.4s[1] | ||||
| fmul v23.4s, v2.4s, v9.4s[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v23.4s, v23.4s | |||||
| eor v23.16b, v23.16b, v23.16b | |||||
| fmls v23.4s, v2.4s, v9.4s[1] | |||||
| #else | |||||
| fmul v23.4s, v2.4s, v9.4s[1] | |||||
| #endif | #endif | ||||
| OP_ir v23.4s, v3.4s, v8.4s[1] | OP_ir v23.4s, v3.4s, v8.4s[1] | ||||
| fmul v26.4s, v2.4s, v8.4s[2] | fmul v26.4s, v2.4s, v8.4s[2] | ||||
| OP_ii v26.4s, v3.4s, v9.4s[2] | OP_ii v26.4s, v3.4s, v9.4s[2] | ||||
| fmul v27.4s, v2.4s, v9.4s[2] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v27.4s, v27.4s | |||||
| eor v27.16b, v27.16b, v27.16b | |||||
| fmls v27.4s, v2.4s, v9.4s[2] | |||||
| #else | |||||
| fmul v27.4s, v2.4s, v9.4s[2] | |||||
| #endif | #endif | ||||
| OP_ir v27.4s, v3.4s, v8.4s[2] | OP_ir v27.4s, v3.4s, v8.4s[2] | ||||
| fmul v30.4s, v2.4s, v8.4s[3] | fmul v30.4s, v2.4s, v8.4s[3] | ||||
| OP_ii v30.4s, v3.4s, v9.4s[3] | OP_ii v30.4s, v3.4s, v9.4s[3] | ||||
| fmul v31.4s, v2.4s, v9.4s[3] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v31.4s, v31.4s | |||||
| eor v31.16b, v31.16b, v31.16b | |||||
| fmls v31.4s, v2.4s, v9.4s[3] | |||||
| #else | |||||
| fmul v31.4s, v2.4s, v9.4s[3] | |||||
| #endif | #endif | ||||
| OP_ir v31.4s, v3.4s, v8.4s[3] | OP_ir v31.4s, v3.4s, v8.4s[3] | ||||
| @@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmul v16.4s, v0.4s, v8.4s[0] | fmul v16.4s, v0.4s, v8.4s[0] | ||||
| OP_ii v16.4s, v1.4s, v9.4s[0] | OP_ii v16.4s, v1.4s, v9.4s[0] | ||||
| fmul v17.4s, v0.4s, v9.4s[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v17.4s, v17.4s | |||||
| eor v17.16b, v17.16b, v17.16b | |||||
| fmls v17.4s, v0.4s, v9.4s[0] | |||||
| #else | |||||
| fmul v17.4s, v0.4s, v9.4s[0] | |||||
| #endif | #endif | ||||
| OP_ir v17.4s, v1.4s, v8.4s[0] | OP_ir v17.4s, v1.4s, v8.4s[0] | ||||
| fmul v20.4s, v0.4s, v8.4s[1] | fmul v20.4s, v0.4s, v8.4s[1] | ||||
| OP_ii v20.4s, v1.4s, v9.4s[1] | OP_ii v20.4s, v1.4s, v9.4s[1] | ||||
| fmul v21.4s, v0.4s, v9.4s[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v21.4s, v21.4s | |||||
| eor v21.16b, v21.16b, v21.16b | |||||
| fmls v21.4s, v0.4s, v9.4s[1] | |||||
| #else | |||||
| fmul v21.4s, v0.4s, v9.4s[1] | |||||
| #endif | #endif | ||||
| OP_ir v21.4s, v1.4s, v8.4s[1] | OP_ir v21.4s, v1.4s, v8.4s[1] | ||||
| fmul v24.4s, v0.4s, v8.4s[2] | fmul v24.4s, v0.4s, v8.4s[2] | ||||
| OP_ii v24.4s, v1.4s, v9.4s[2] | OP_ii v24.4s, v1.4s, v9.4s[2] | ||||
| fmul v25.4s, v0.4s, v9.4s[2] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v25.4s, v25.4s | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fmls v25.4s, v0.4s, v9.4s[2] | |||||
| #else | |||||
| fmul v25.4s, v0.4s, v9.4s[2] | |||||
| #endif | #endif | ||||
| OP_ir v25.4s, v1.4s, v8.4s[2] | OP_ir v25.4s, v1.4s, v8.4s[2] | ||||
| fmul v28.4s, v0.4s, v8.4s[3] | fmul v28.4s, v0.4s, v8.4s[3] | ||||
| OP_ii v28.4s, v1.4s, v9.4s[3] | OP_ii v28.4s, v1.4s, v9.4s[3] | ||||
| fmul v29.4s, v0.4s, v9.4s[3] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v29.4s, v29.4s | |||||
| eor v29.16b, v29.16b, v29.16b | |||||
| fmls v29.4s, v0.4s, v9.4s[3] | |||||
| #else | |||||
| fmul v29.4s, v0.4s, v9.4s[3] | |||||
| #endif | #endif | ||||
| OP_ir v29.4s, v1.4s, v8.4s[3] | OP_ir v29.4s, v1.4s, v8.4s[3] | ||||
| @@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define COND ge | #define COND ge | ||||
| #endif | #endif | ||||
| #if !defined(DOUBLE) | |||||
| #define MAXF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define MAXF d0 | #define MAXF d0 | ||||
| #define TMPF d1 | #define TMPF d1 | ||||
| #define TMPVF {v1.d}[0] | #define TMPVF {v1.d}[0] | ||||
| #define SZ 8 | #define SZ 8 | ||||
| #endif | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| .macro INIT_S | .macro INIT_S | ||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 | |||||
| ld1 {v0.s}[0], [X], INC_X | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 | lsl INC_X, INC_X, #3 | ||||
| ld1 {v0.d}[0], [X], INC_X | ld1 {v0.d}[0], [X], INC_X | ||||
| #endif | |||||
| mov Z, #1 | mov Z, #1 | ||||
| mov INDEX, Z | mov INDEX, Z | ||||
| fabs MAXF, MAXF | fabs MAXF, MAXF | ||||
| @@ -107,9 +119,8 @@ iamax_kernel_S1: | |||||
| iamax_kernel_S10: | iamax_kernel_S10: | ||||
| KERNEL_S1 | KERNEL_S1 | ||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| iamax_kernel_L999: | iamax_kernel_L999: | ||||
| @@ -1,213 +0,0 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define INDEX x3 /* index of max/min value */ | |||||
| #define Z x4 /* vector index */ | |||||
| #define I x5 /* loop variable */ | |||||
| #define X_COPY x6 /* copy of X address */ | |||||
| #define MAXF_Z x7 | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #define MAXF s5 | |||||
| #define TMPF s6 | |||||
| #define TMPVF {v6.s}[0] | |||||
| #define SZ 4 | |||||
| /******************************************************************************/ | |||||
| .macro INIT_F1 | |||||
| ldr MAXF, [X], #SZ | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs MAXF, MAXF | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| ldr TMPF, [X], #SZ | |||||
| add Z, Z, #1 | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel INDEX, INDEX, Z, le | |||||
| .endm | |||||
| .macro INIT_F4 | |||||
| ld1 {v0.4s}, [X], #16 | |||||
| fabs v0.4s, v0.4s | |||||
| fmaxv MAXF, v0.4s | |||||
| mov Z, #5 | |||||
| mov MAXF_Z, #1 | |||||
| .endm | |||||
| .macro KERNEL_F4 | |||||
| ld1 {v0.4s}, [X], #16 | |||||
| fabs v0.4s, v0.4s | |||||
| fmaxv TMPF, v0.4s | |||||
| PRFM PLDL1KEEP, [X, #512] | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel MAXF_Z, MAXF_Z, Z, le | |||||
| add Z, Z, #4 | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE | |||||
| mov INDEX, MAXF_Z | |||||
| sub MAXF_Z, MAXF_Z, #1 | |||||
| lsl MAXF_Z, MAXF_Z, #2 | |||||
| add X_COPY, X_COPY, MAXF_Z | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| KERNEL_F4_FINALIZE_DONE: | |||||
| .endm | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #2 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs MAXF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| add Z, Z, #1 | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel INDEX, INDEX, Z, le | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| cmp N, xzr | |||||
| ble iamax_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble iamax_kernel_zero | |||||
| PRFM PLDL1KEEP, [X] | |||||
| mov X_COPY, X | |||||
| cmp INC_X, #1 | |||||
| bne iamax_kernel_S_BEGIN | |||||
| iamax_kernel_F_BEGIN: | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| beq iamax_kernel_F1_INIT | |||||
| INIT_F4 | |||||
| subs I, I, #1 | |||||
| beq iamax_kernel_F4_FINALIZE | |||||
| iamax_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F4 | |||||
| iamax_kernel_F4_FINALIZE: | |||||
| KERNEL_F4_FINALIZE | |||||
| iamax_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F10 | |||||
| b iamax_kernel_L999 | |||||
| iamax_kernel_F1_INIT: | |||||
| INIT_F1 | |||||
| subs N, N, #1 | |||||
| b iamax_kernel_F1 | |||||
| iamax_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble iamax_kernel_S1 | |||||
| iamax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S4 | |||||
| iamax_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| iamax_kernel_L999: | |||||
| mov x0, INDEX | |||||
| ret | |||||
| iamax_kernel_zero: | |||||
| mov x0, xzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro INIT_F1 | .macro INIT_F1 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| fneg s2, S | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, S | |||||
| ins v1.s[1], v2.s[0] // [-S, S] | ins v1.s[1], v2.s[0] // [-S, S] | ||||
| #else | #else | ||||
| fneg d2, S | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, S | |||||
| ins v1.d[1], v2.d[0] // [-S, S] | ins v1.d[1], v2.d[0] // [-S, S] | ||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| @@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define DA_R s0 /* scale input value */ | #define DA_R s0 /* scale input value */ | ||||
| #define DA_I s1 /* scale input value */ | #define DA_I s1 /* scale input value */ | ||||
| #define TMPX v2.2s | |||||
| #define TMPY v3.2s | |||||
| #define SZ 4 | #define SZ 4 | ||||
| #else | #else | ||||
| #define DA_R d0 /* scale input value */ | #define DA_R d0 /* scale input value */ | ||||
| #define DA_I d1 /* scale input value */ | #define DA_I d1 /* scale input value */ | ||||
| #define TMPX v2.2d | |||||
| #define TMPY v3.2d | |||||
| #define SZ 8 | #define SZ 8 | ||||
| #endif | #endif | ||||
| @@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | ||||
| fneg s2, DA_I | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, DA_I | |||||
| ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | ||||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | ||||
| #else | #else | ||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ||||
| fneg d2, DA_I | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, DA_I | |||||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | ||||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | ||||
| #endif | #endif | ||||
| #else | #else | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| fneg s2, DA_R | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, DA_R | |||||
| ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | ||||
| ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | ||||
| #else | #else | ||||
| fneg d2, DA_R | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, DA_R | |||||
| ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | ||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | ||||
| #endif | #endif | ||||
| @@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL_INIT_F4 | .macro KERNEL_INIT_F4 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| // Replicate the lower 2 floats into the upper 2 slots | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||||
| ins v16.s[0], v0.s[0] | |||||
| ins v16.s[1], v16.s[0] | |||||
| ins v16.d[1], v16.d[0] | |||||
| #if !defined(CONJ) | |||||
| ins v17.s[0], v1.s[1] | |||||
| #else | |||||
| ins v17.s[0], v1.s[0] | |||||
| #endif | |||||
| ins v17.s[1], v17.s[0] | |||||
| ins v17.d[1], v17.d[0] | |||||
| #else //DOUBLE | |||||
| ins v16.d[0], v0.d[0] | |||||
| ins v16.d[1], v16.d[0] | |||||
| #if !defined(CONJ) | |||||
| ins v17.d[0], v1.d[1] | |||||
| #else | |||||
| ins v17.d[0], v1.d[0] | |||||
| #endif | |||||
| ins v17.d[1], v17.d[0] | |||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| @@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL_F4 | .macro KERNEL_F4 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] | |||||
| // V3 = X[7], X[6], X[5], X[4] | |||||
| ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
| ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
| ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
| ld2 {v2.4s, v3.4s}, [X], #32 | |||||
| ld2 {v4.4s, v5.4s}, [Y_COPY], #32 | |||||
| ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] | |||||
| // V5 = Y[7], Y[6], Y[5], Y[4] | |||||
| ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
| ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
| ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
| fmla v4.4s, v2.4s, v16.4s | |||||
| #if !defined(CONJ) | |||||
| fmls v4.4s, v3.4s, v17.4s | |||||
| #else | |||||
| fmla v4.4s, v3.4s, v17.4s | |||||
| #endif | |||||
| fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v4.4s}, [Y], #16 | |||||
| #if !defined(CONJ) | |||||
| fmla v5.4s, v2.4s, v17.4s | |||||
| #else | |||||
| fmls v5.4s, v2.4s, v17.4s | |||||
| #endif | |||||
| fmla v5.4s, v3.4s, v16.4s | |||||
| fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] | |||||
| fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v5.4s}, [Y], #16 | |||||
| st2 {v4.4s, v5.4s}, [Y], #32 | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||||
| ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
| ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
| ld2 {v2.2d, v3.2d}, [X], #32 | |||||
| ld2 {v4.2d, v5.2d}, [Y_COPY], #32 | |||||
| ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||||
| ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
| ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
| ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||||
| fmla v4.2d, v2.2d, v16.2d | |||||
| #if !defined(CONJ) | |||||
| fmls v4.2d, v3.2d, v17.2d | |||||
| #else | |||||
| fmla v4.2d, v3.2d, v17.2d | |||||
| #endif | |||||
| #if !defined(CONJ) | |||||
| fmla v5.2d, v2.2d, v17.2d | |||||
| #else | |||||
| fmls v5.2d, v2.2d, v17.2d | |||||
| #endif | |||||
| fmla v5.2d, v3.2d, v16.2d | |||||
| fmla v16.2d, v0.2d, v2.2d | |||||
| fmla v17.2d, v0.2d, v3.2d | |||||
| st2 {v4.2d, v5.2d}, [Y], #32 | |||||
| ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||||
| ld2 {v18.2d, v19.2d}, [X], #32 | |||||
| ld2 {v20.2d, v21.2d}, [Y_COPY], #32 | |||||
| fmla v16.2d, v1.2d, v20.2d | |||||
| fmla v17.2d, v1.2d, v21.2d | |||||
| st1 {v16.2d,v17.2d}, [Y], #32 | |||||
| fmla v20.2d, v18.2d, v16.2d | |||||
| #if !defined(CONJ) | |||||
| fmls v20.2d, v19.2d, v17.2d | |||||
| #else | |||||
| fmla v20.2d, v19.2d, v17.2d | |||||
| #endif | |||||
| #if !defined(CONJ) | |||||
| fmla v21.2d, v18.2d, v17.2d | |||||
| #else | |||||
| fmls v21.2d, v18.2d, v17.2d | |||||
| #endif | |||||
| fmla v21.2d, v19.2d, v16.2d | |||||
| fmla v18.2d, v0.2d, v4.2d | |||||
| fmla v19.2d, v0.2d, v5.2d | |||||
| fmla v18.2d, v1.2d, v22.2d | |||||
| fmla v19.2d, v1.2d, v23.2d | |||||
| st1 {v18.2d,v19.2d}, [Y], #32 | |||||
| st2 {v20.2d, v21.2d}, [Y], #32 | |||||
| #endif | #endif | ||||
| PRFM PLDL1KEEP, [X, #512] | PRFM PLDL1KEEP, [X, #512] | ||||
| PRFM PLDL1KEEP, [Y, #512] | PRFM PLDL1KEEP, [Y, #512] | ||||
| @@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmul v16.2d, v0.2d, v8.2d[0] | fmul v16.2d, v0.2d, v8.2d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.2d[0] | OP_ii v16.2d, v1.2d, v9.2d[0] | ||||
| fmul v17.2d, v0.2d, v9.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v17.2d, v17.2d | |||||
| eor v17.16b, v17.16b, v17.16b | |||||
| fmls v17.2d, v0.2d, v9.2d[0] | |||||
| #else | |||||
| fmul v17.2d, v0.2d, v9.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.2d[0] | OP_ir v17.2d, v1.2d, v8.2d[0] | ||||
| fmul v18.2d, v2.2d, v8.2d[0] | fmul v18.2d, v2.2d, v8.2d[0] | ||||
| OP_ii v18.2d, v3.2d, v9.2d[0] | OP_ii v18.2d, v3.2d, v9.2d[0] | ||||
| fmul v19.2d, v2.2d, v9.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v19.2d, v19.2d | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.2d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v19.2d, v3.2d, v8.2d[0] | OP_ir v19.2d, v3.2d, v8.2d[0] | ||||
| fmul v20.2d, v0.2d, v8.2d[1] | fmul v20.2d, v0.2d, v8.2d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.2d[1] | OP_ii v20.2d, v1.2d, v9.2d[1] | ||||
| fmul v21.2d, v0.2d, v9.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v21.2d, v21.2d | |||||
| eor v21.16b, v21.16b, v21.16b | |||||
| fmls v21.2d, v0.2d, v9.2d[1] | |||||
| #else | |||||
| fmul v21.2d, v0.2d, v9.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.2d[1] | OP_ir v21.2d, v1.2d, v8.2d[1] | ||||
| fmul v22.2d, v2.2d, v8.2d[1] | fmul v22.2d, v2.2d, v8.2d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.2d[1] | OP_ii v22.2d, v3.2d, v9.2d[1] | ||||
| fmul v23.2d, v2.2d, v9.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v23.2d, v23.2d | |||||
| eor v23.16b, v23.16b, v23.16b | |||||
| fmls v23.2d, v2.2d, v9.2d[1] | |||||
| #else | |||||
| fmul v23.2d, v2.2d, v9.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.2d[1] | OP_ir v23.2d, v3.2d, v8.2d[1] | ||||
| fmul v24.2d, v0.2d, v10.2d[0] | fmul v24.2d, v0.2d, v10.2d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.2d[0] | OP_ii v24.2d, v1.2d, v11.2d[0] | ||||
| fmul v25.2d, v0.2d, v11.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v25.2d, v25.2d | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fmls v25.2d, v0.2d, v11.2d[0] | |||||
| #else | |||||
| fmul v25.2d, v0.2d, v11.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.2d[0] | OP_ir v25.2d, v1.2d, v10.2d[0] | ||||
| fmul v26.2d, v2.2d, v10.2d[0] | fmul v26.2d, v2.2d, v10.2d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.2d[0] | OP_ii v26.2d, v3.2d, v11.2d[0] | ||||
| fmul v27.2d, v2.2d, v11.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v27.2d, v27.2d | |||||
| eor v27.16b, v27.16b, v27.16b | |||||
| fmls v27.2d, v2.2d, v11.2d[0] | |||||
| #else | |||||
| fmul v27.2d, v2.2d, v11.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.2d[0] | OP_ir v27.2d, v3.2d, v10.2d[0] | ||||
| fmul v28.2d, v0.2d, v10.2d[1] | fmul v28.2d, v0.2d, v10.2d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.2d[1] | OP_ii v28.2d, v1.2d, v11.2d[1] | ||||
| fmul v29.2d, v0.2d, v11.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v29.2d, v29.2d | |||||
| eor v29.16b, v29.16b, v29.16b | |||||
| fmls v29.2d, v0.2d, v11.2d[1] | |||||
| #else | |||||
| fmul v29.2d, v0.2d, v11.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.2d[1] | OP_ir v29.2d, v1.2d, v10.2d[1] | ||||
| fmul v30.2d, v2.2d, v10.2d[1] | fmul v30.2d, v2.2d, v10.2d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.2d[1] | OP_ii v30.2d, v3.2d, v11.2d[1] | ||||
| fmul v31.2d, v2.2d, v11.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v31.2d, v31.2d | |||||
| eor v31.16b, v31.16b, v31.16b | |||||
| fmls v31.2d, v2.2d, v11.2d[1] | |||||
| #else | |||||
| fmul v31.2d, v2.2d, v11.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.2d[1] | OP_ir v31.2d, v3.2d, v10.2d[1] | ||||
| @@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******* INIT FOR F1 AND S1 LOOP ******/ | /******* INIT FOR F1 AND S1 LOOP ******/ | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||||
| fneg s2, ALPHA_I | |||||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, ALPHA_I | |||||
| ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) | ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) | ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) | ||||
| #endif | #endif | ||||
| #else | #else | ||||
| ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||||
| fneg d2, ALPHA_I | |||||
| ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, ALPHA_I | |||||
| ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) | ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) | ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) | ||||
| @@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | ||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | ||||
| fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] | |||||
| fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] | |||||
| eor v12.16b, v12.16b, v12.16b | |||||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||||
| fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | ||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| @@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ins v3.s[0], v2.s[1] | ins v3.s[0], v2.s[1] | ||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fneg s4, s3 | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub s4, s4, s3 | |||||
| ins v3.s[1], v4.s[0] | ins v3.s[1], v4.s[0] | ||||
| ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | ||||
| ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ||||
| #else | #else | ||||
| fneg s4, s3 | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub s4, s4, s3 | |||||
| ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | ||||
| ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ||||
| #endif | #endif | ||||
| #else // CONJ | #else // CONJ | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | ||||
| fneg s4, s2 | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub s4, s4, s2 | |||||
| ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ||||
| #else | #else | ||||
| fneg s3, s3 | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub s3, s4, s3 | |||||
| ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | ||||
| fneg s4, s2 | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub s4, s4, s2 | |||||
| ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| @@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | ||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | ||||
| fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] | |||||
| fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] | |||||
| eor v12.16b, v12.16b, v12.16b | |||||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||||
| fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | ||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| @@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ins v3.d[0], v2.d[1] // I(TEMP) | ins v3.d[0], v2.d[1] // I(TEMP) | ||||
| #if !defined(CONJ) | #if !defined(CONJ) | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| fneg d4, d3 // -I(TEMP) | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub d4, d4, d3 | |||||
| ins v3.d[1], v4.d[0] | ins v3.d[1], v4.d[0] | ||||
| ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | ||||
| ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ||||
| #else | #else | ||||
| fneg d4, d3 // -I(TEMP) | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub d4, d4, d3 | |||||
| ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | ||||
| ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ||||
| #endif | #endif | ||||
| #else // CONJ | #else // CONJ | ||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | ||||
| fneg d4, d2 // -R(TEMP) | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub d4, d4, d2 | |||||
| ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ||||
| #else | #else | ||||
| fneg d3, d3 // -I(TEMP) | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub d3, d4, d3 | |||||
| ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | ||||
| fneg d4, d2 // -R(TEMP) | |||||
| eor v4.16b, v4.16b, v4.16b | |||||
| fsub d4, d4, d2 | |||||
| ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ||||
| #endif | #endif | ||||
| #endif // CONJ | #endif // CONJ | ||||
| @@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(XCONJ) | #if !defined(XCONJ) | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | ||||
| fneg s2, ALPHA_I | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, ALPHA_I | |||||
| ins v1.s[1], v2.s[0] | ins v1.s[1], v2.s[0] | ||||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | ||||
| #else | #else | ||||
| ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | ||||
| fneg d2, ALPHA_I | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, ALPHA_I | |||||
| ins v1.d[1], v2.d[0] | ins v1.d[1], v2.d[0] | ||||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | ||||
| #endif | #endif | ||||
| #else // XCONJ | #else // XCONJ | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| fneg s2, ALPHA_R | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, ALPHA_R | |||||
| ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | ||||
| ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | ||||
| #else | #else | ||||
| fneg d2, ALPHA_R | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, ALPHA_R | |||||
| ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | ||||
| ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | ||||
| #endif | #endif | ||||
| @@ -136,89 +140,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ||||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | ||||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | ||||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | ||||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | ||||
| #else | #else | ||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | ||||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | ||||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | ||||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | ||||
| #else | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
| #endif | #endif | ||||
| #endif // CONJ | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ||||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ||||
| prfm PLDL1STRM, [X_PTR, #512] | prfm PLDL1STRM, [X_PTR, #512] | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | ||||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | ||||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | ||||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | ||||
| #else | #else | ||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | ||||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | ||||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | ||||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | ||||
| #else | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
| #endif | #endif | ||||
| #endif // CONJ | |||||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ||||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ||||
| prfm PLDL1STRM, [A_PTR, #512] | prfm PLDL1STRM, [A_PTR, #512] | ||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | ||||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | ||||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | ||||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | ||||
| #else | #else | ||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | |||||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | ||||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | ||||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | ||||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | ||||
| #else | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | |||||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||||
| #endif | #endif | ||||
| #endif // CONJ | |||||
| #endif //DOUBLE | #endif //DOUBLE | ||||
| .endm | .endm | ||||
| @@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ||||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | ld1 {v5.s}[0], [A_PTR], #4 // A1 | ||||
| ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | ||||
| fneg s16, s5 | |||||
| eor v16.16b, v16.16b, v16.16b | |||||
| fsub s16, s16, s5 | |||||
| ins v5.s[1], v16.s[0] // [-A1, A1] | ins v5.s[1], v16.s[0] // [-A1, A1] | ||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ||||
| @@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ||||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | ld1 {v5.d}[0], [A_PTR], #8 // A1 | ||||
| ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | ||||
| fneg d16, d5 | |||||
| eor v16.16b, v16.16b, v16.16b | |||||
| fsub d16, d16, d5 | |||||
| ins v5.d[1], v16.d[0] // [-A1, A1] | ins v5.d[1], v16.d[0] // [-A1, A1] | ||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ||||
| @@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ||||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | ld1 {v5.s}[0], [A_PTR], #4 // A1 | ||||
| ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | ||||
| fneg s16, s5 | |||||
| eor v16.16b, v16.16b, v16.16b | |||||
| fsub s16, s16, s5 | |||||
| ins v5.s[1], v16.s[0] // [-A1, A1] | ins v5.s[1], v16.s[0] // [-A1, A1] | ||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ||||
| @@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ||||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | ld1 {v5.d}[0], [A_PTR], #8 // A1 | ||||
| ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | ||||
| fneg d16, d5 | |||||
| eor v16.16b, v16.16b, v16.16b | |||||
| fsub d16, d16, d5 | |||||
| ins v5.d[1], v16.d[0] // [-A1, A1] | ins v5.d[1], v16.d[0] // [-A1, A1] | ||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ||||
| @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define X x3 /* X vector address */ | #define X x3 /* X vector address */ | ||||
| #define INC_X x4 /* X stride */ | #define INC_X x4 /* X stride */ | ||||
| #define I x5 /* loop variable */ | #define I x5 /* loop variable */ | ||||
| #define X_COPY x6 /* Copy of X */ | |||||
| /******************************************************************************* | /******************************************************************************* | ||||
| * Macro definitions | * Macro definitions | ||||
| @@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro INIT | .macro INIT | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
| fneg s2, DA_I | |||||
| ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | |||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
| #else | #else | ||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ||||
| fneg d2, DA_I | |||||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | |||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| .macro KERNEL_F1 | .macro KERNEL_F1 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v2.2s}, [X] // X1, X0 | ld1 {v2.2s}, [X] // X1, X0 | ||||
| ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2s}, [X], #8 | |||||
| fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||||
| fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||||
| fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||||
| fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||||
| fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||||
| fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||||
| ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v3.2s}, [X], #8 | |||||
| #else | #else | ||||
| ld1 {v2.2d}, [X] // X1, X0 | ld1 {v2.2d}, [X] // X1, X0 | ||||
| ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
| fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2d}, [X], #16 | |||||
| #endif | |||||
| fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||||
| fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||||
| fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||||
| fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||||
| fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||||
| fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||||
| ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v3.2d}, [X], #16 | |||||
| #endif | |||||
| .endm | .endm | ||||
| .macro KERNEL_INIT_F4 | .macro KERNEL_INIT_F4 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| // Replicate the lower 2 floats into the upper 2 slots | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||||
| ins v16.s[0], v0.s[0] | |||||
| ins v16.s[1], v16.s[0] | |||||
| ins v16.d[1], v16.d[0] | |||||
| ins v17.s[0], v1.s[0] | |||||
| ins v17.s[1], v17.s[0] | |||||
| ins v17.d[1], v17.d[0] | |||||
| #else //DOUBLE | |||||
| ins v16.d[0], v0.d[0] | |||||
| ins v16.d[1], v16.d[0] | |||||
| ins v17.d[0], v1.d[0] | |||||
| ins v17.d[1], v17.d[0] | |||||
| #endif | #endif | ||||
| .endm | .endm | ||||
| @@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL_F4 | .macro KERNEL_F4 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] | |||||
| // V3 = X[7], X[6], X[5], X[4] | |||||
| ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
| ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
| ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
| fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix] | |||||
| // X'[ix+1] += DA_R * X[ix+1] | |||||
| fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] | |||||
| // X'[ix+1] += DA_I * X[ix] | |||||
| ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
| ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
| ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
| fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix] | |||||
| // X'[ix+1] += DA_R * X[ix+1] | |||||
| fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] | |||||
| // X'[ix+1] += DA_I * X[ix] | |||||
| st1 {v2.4s,v3.4s}, [X], #32 | |||||
| ld2 {v2.4s, v3.4s}, [X], #32 | |||||
| fmul v4.4s, v2.4s, v16.4s | |||||
| fmul v6.4s, v3.4s, v17.4s | |||||
| fsub v4.4s, v4.4s, v6.4s | |||||
| fmul v5.4s, v2.4s, v17.4s | |||||
| fmul v6.4s, v3.4s, v16.4s | |||||
| fadd v5.4s, v5.4s, v6.4s | |||||
| st2 {v4.4s, v5.4s}, [X_COPY], #32 | |||||
| #else // DOUBLE | #else // DOUBLE | ||||
| ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 | |||||
| ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
| ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
| ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
| ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
| ld2 {v2.2d, v3.2d}, [X], #32 | |||||
| fmul v2.2d, v0.2d, v2.2d | |||||
| fmla v2.2d, v1.2d, v20.2d | |||||
| fmul v4.2d, v2.2d, v16.2d | |||||
| fmul v6.2d, v3.2d, v17.2d | |||||
| fsub v4.2d, v4.2d, v6.2d | |||||
| fmul v5.2d, v2.2d, v17.2d | |||||
| fmul v6.2d, v3.2d, v16.2d | |||||
| fadd v5.2d, v5.2d, v6.2d | |||||
| fmul v3.2d, v0.2d, v3.2d | |||||
| fmla v3.2d, v1.2d, v21.2d | |||||
| st1 {v2.2d,v3.2d}, [X], #32 | |||||
| st2 {v4.2d, v5.2d}, [X_COPY], #32 | |||||
| fmul v4.2d, v0.2d, v4.2d | |||||
| fmla v4.2d, v1.2d, v22.2d | |||||
| ld2 {v18.2d, v19.2d}, [X], #32 | |||||
| fmul v5.2d, v0.2d, v5.2d | |||||
| fmla v5.2d, v1.2d, v23.2d | |||||
| st1 {v4.2d,v5.2d}, [X], #32 | |||||
| fmul v20.2d, v18.2d, v16.2d | |||||
| fmul v6.2d, v19.2d, v17.2d | |||||
| fsub v20.2d, v20.2d, v6.2d | |||||
| fmul v21.2d, v18.2d, v17.2d | |||||
| fmul v6.2d, v19.2d, v16.2d | |||||
| fadd v21.2d, v21.2d, v6.2d | |||||
| st2 {v20.2d, v21.2d}, [X_COPY], #32 | |||||
| #endif | #endif | ||||
| PRFM PLDL1KEEP, [X, #1024] | PRFM PLDL1KEEP, [X, #1024] | ||||
| .endm | .endm | ||||
| @@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro KERNEL_S1 | .macro KERNEL_S1 | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| ld1 {v2.2s}, [X] // X1, X0 | ld1 {v2.2s}, [X] // X1, X0 | ||||
| ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2s}, [X], INC_X | |||||
| fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||||
| fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||||
| fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||||
| fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||||
| fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||||
| fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||||
| ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v3.2s}, [X], INC_X | |||||
| #else | #else | ||||
| ld1 {v2.2d}, [X] // X1, X0 | ld1 {v2.2d}, [X] // X1, X0 | ||||
| ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
| fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2d}, [X], INC_X | |||||
| #endif | |||||
| fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||||
| fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||||
| fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||||
| fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||||
| fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||||
| fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||||
| ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v3.2d}, [X], INC_X | |||||
| #endif | |||||
| .endm | .endm | ||||
| /******************************************************************************* | /******************************************************************************* | ||||
| @@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | *******************************************************************************/ | ||||
| PROLOGUE | PROLOGUE | ||||
| b zscal_begin | |||||
| data_ar: | |||||
| .word 0x3e44fae6 | |||||
| data_ai: | |||||
| .word 0x3d320fa2 | |||||
| data_xr: | |||||
| .word 0x3f4baff1 | |||||
| data_xi: | |||||
| .word 0xbe8ef0bd | |||||
| zscal_begin: | |||||
| ldr s20, data_ar | |||||
| ldr s21, data_ai | |||||
| ldr s22, data_xr | |||||
| ldr s23, data_xi | |||||
| fmul s24, s22, s21 | |||||
| fmla s24, s23, v20.s[0] | |||||
| fmul s25, s22, s21 | |||||
| fmul s26, s23, s20 | |||||
| fadd s25, s25, s26 | |||||
| mov X_COPY, X | |||||
| cmp N, xzr | cmp N, xzr | ||||
| ble zscal_kernel_L999 | ble zscal_kernel_L999 | ||||
| fcmp DA_R, #0.0 | fcmp DA_R, #0.0 | ||||
| bne zscal_kernel_1 | |||||
| bne zscal_kernel_R_non_zero | |||||
| fcmp DA_I, #0.0 | fcmp DA_I, #0.0 | ||||
| beq zscal_kernel_zero | |||||
| beq zscal_kernel_RI_zero | |||||
| // TODO: special case DA_R == 0 && DA_I != 0 | |||||
| b zscal_kernel_R_zero | |||||
| zscal_kernel_1: | |||||
| zscal_kernel_R_non_zero: | |||||
| // TODO: special case DA_R != 0 && DA_I == 0 | |||||
| fcmp DA_I, #0.0 | |||||
| beq zscal_kernel_I_zero | |||||
| /******************************************************************************* | |||||
| * A_R != 0 && A_I != 0 | |||||
| *******************************************************************************/ | |||||
| zscal_kernel_RI_non_zero: | |||||
| INIT | INIT | ||||
| @@ -257,16 +306,85 @@ zscal_kernel_L999: | |||||
| mov w0, wzr | mov w0, wzr | ||||
| ret | ret | ||||
| zscal_kernel_zero: | |||||
| /******************************************************************************* | |||||
| * A_R == 0 && A_I != 0 | |||||
| *******************************************************************************/ | |||||
| zscal_kernel_R_zero: | |||||
| INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub s2, s2, DA_I | |||||
| ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
| #else | |||||
| eor v2.16b, v2.16b, v2.16b | |||||
| fsub d2, d2, DA_I | |||||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||||
| #endif | |||||
| zscal_kernel_R_zero_1: | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] // X1, X0 | |||||
| fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 | |||||
| ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 | |||||
| st1 {v2.2s}, [X] | |||||
| #else | |||||
| ld1 {v2.2d}, [X] // X1, X0 | |||||
| fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 | |||||
| ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 | |||||
| st1 {v2.2d}, [X] | |||||
| #endif | |||||
| add X, X, INC_X | |||||
| subs N, N, #1 | |||||
| bne zscal_kernel_R_zero_1 | |||||
| mov w0, wzr | |||||
| ret | |||||
| /******************************************************************************* | |||||
| * A_R != 0 && A_I == 0 | |||||
| *******************************************************************************/ | |||||
| zscal_kernel_I_zero: | |||||
| INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||||
| #endif | |||||
| zscal_kernel_I_zero_1: | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] // X1, X0 | |||||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
| st1 {v2.2s}, [X] | |||||
| #else | |||||
| ld1 {v2.2d}, [X] // X1, X0 | |||||
| fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
| st1 {v2.2d}, [X] | |||||
| #endif | |||||
| add X, X, INC_X | |||||
| subs N, N, #1 | |||||
| bne zscal_kernel_I_zero_1 | |||||
| mov w0, wzr | |||||
| ret | |||||
| /******************************************************************************* | |||||
| * A_R == 0 && A_I == 0 | |||||
| *******************************************************************************/ | |||||
| zscal_kernel_RI_zero: | |||||
| INIT_S | INIT_S | ||||
| zscal_kernel_Z1: | |||||
| zscal_kernel_RI_zero_1: | |||||
| stp DA_R, DA_I, [X] | stp DA_R, DA_I, [X] | ||||
| add X, X, INC_X | add X, X, INC_X | ||||
| subs N, N, #1 | |||||
| bne zscal_kernel_Z1 | |||||
| subs N, N, #1 | |||||
| bne zscal_kernel_RI_zero_1 | |||||
| mov w0, wzr | mov w0, wzr | ||||
| ret | ret | ||||
| @@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmul v16.2d, v0.2d, v8.2d[0] | fmul v16.2d, v0.2d, v8.2d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.2d[0] | OP_ii v16.2d, v1.2d, v9.2d[0] | ||||
| fmul v17.2d, v0.2d, v9.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v17.2d, v17.2d | |||||
| eor v17.16b, v17.16b, v17.16b | |||||
| fmls v17.2d, v0.2d, v9.2d[0] | |||||
| #else | |||||
| fmul v17.2d, v0.2d, v9.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.2d[0] | OP_ir v17.2d, v1.2d, v8.2d[0] | ||||
| fmul v18.2d, v2.2d, v8.2d[0] | fmul v18.2d, v2.2d, v8.2d[0] | ||||
| OP_ii v18.2d, v3.2d, v9.2d[0] | OP_ii v18.2d, v3.2d, v9.2d[0] | ||||
| fmul v19.2d, v2.2d, v9.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v19.2d, v19.2d | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.2d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v19.2d, v3.2d, v8.2d[0] | OP_ir v19.2d, v3.2d, v8.2d[0] | ||||
| fmul v20.2d, v0.2d, v8.2d[1] | fmul v20.2d, v0.2d, v8.2d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.2d[1] | OP_ii v20.2d, v1.2d, v9.2d[1] | ||||
| fmul v21.2d, v0.2d, v9.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v21.2d, v21.2d | |||||
| eor v21.16b, v21.16b, v21.16b | |||||
| fmls v21.2d, v0.2d, v9.2d[1] | |||||
| #else | |||||
| fmul v21.2d, v0.2d, v9.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.2d[1] | OP_ir v21.2d, v1.2d, v8.2d[1] | ||||
| fmul v22.2d, v2.2d, v8.2d[1] | fmul v22.2d, v2.2d, v8.2d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.2d[1] | OP_ii v22.2d, v3.2d, v9.2d[1] | ||||
| fmul v23.2d, v2.2d, v9.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v23.2d, v23.2d | |||||
| eor v23.16b, v23.16b, v23.16b | |||||
| fmls v23.2d, v2.2d, v9.2d[1] | |||||
| #else | |||||
| fmul v23.2d, v2.2d, v9.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.2d[1] | OP_ir v23.2d, v3.2d, v8.2d[1] | ||||
| fmul v24.2d, v0.2d, v10.2d[0] | fmul v24.2d, v0.2d, v10.2d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.2d[0] | OP_ii v24.2d, v1.2d, v11.2d[0] | ||||
| fmul v25.2d, v0.2d, v11.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v25.2d, v25.2d | |||||
| eor v25.16b, v25.16b, v25.16b | |||||
| fmls v25.2d, v0.2d, v11.2d[0] | |||||
| #else | |||||
| fmul v25.2d, v0.2d, v11.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.2d[0] | OP_ir v25.2d, v1.2d, v10.2d[0] | ||||
| fmul v26.2d, v2.2d, v10.2d[0] | fmul v26.2d, v2.2d, v10.2d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.2d[0] | OP_ii v26.2d, v3.2d, v11.2d[0] | ||||
| fmul v27.2d, v2.2d, v11.2d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v27.2d, v27.2d | |||||
| eor v27.16b, v27.16b, v27.16b | |||||
| fmls v27.2d, v2.2d, v11.2d[0] | |||||
| #else | |||||
| fmul v27.2d, v2.2d, v11.2d[0] | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.2d[0] | OP_ir v27.2d, v3.2d, v10.2d[0] | ||||
| fmul v28.2d, v0.2d, v10.2d[1] | fmul v28.2d, v0.2d, v10.2d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.2d[1] | OP_ii v28.2d, v1.2d, v11.2d[1] | ||||
| fmul v29.2d, v0.2d, v11.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v29.2d, v29.2d | |||||
| eor v29.16b, v29.16b, v29.16b | |||||
| fmls v29.2d, v0.2d, v11.2d[1] | |||||
| #else | |||||
| fmul v29.2d, v0.2d, v11.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.2d[1] | OP_ir v29.2d, v1.2d, v10.2d[1] | ||||
| fmul v30.2d, v2.2d, v10.2d[1] | fmul v30.2d, v2.2d, v10.2d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.2d[1] | OP_ii v30.2d, v3.2d, v11.2d[1] | ||||
| fmul v31.2d, v2.2d, v11.2d[1] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| fneg v31.2d, v31.2d | |||||
| eor v31.16b, v31.16b, v31.16b | |||||
| fmls v31.2d, v2.2d, v11.2d[1] | |||||
| #else | |||||
| fmul v31.2d, v2.2d, v11.2d[1] | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.2d[1] | OP_ir v31.2d, v3.2d, v10.2d[1] | ||||