Browse Source

Improvements to GEMV kernels

tags/v0.2.19^2
Ashwin Sekhar T K 9 years ago
parent
commit
8a40f1355e
4 changed files with 128 additions and 156 deletions
  1. kernel/arm64/gemv_n.S (+9, -0)
  2. kernel/arm64/gemv_t.S (+16, -1)
  3. kernel/arm64/zgemv_n.S (+96, -153)
  4. kernel/arm64/zgemv_t.S (+7, -2)

+ 9
- 0
kernel/arm64/gemv_n.S View File

@@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHZ 3
#endif

#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768

/******************************************************************************/

.macro SAVE_REGS
@@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32

ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32

ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32

ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32

ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif


+ 16
- 1
kernel/arm64/gemv_t.S View File

@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */

#define X_PREFETCH_SIZE 768
#define A_PREFETCH_SIZE 768

/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v6.4s, v10.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v7.4s, v11.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
fmla v4.4s, v8.4s, v12.4s

ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.4s, v14.4s, v18.4s
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d

ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v6.2d, v10.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
fmla v2.2d, v14.2d, v18.2d
prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif


+ 96
- 153
kernel/arm64/zgemv_n.S View File

@@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */

#define A_PRE_SIZE 768
#define Y_PRE_SIZE 768

/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif

@@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0]
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif

/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
eor v2.16b, v2.16b, v2.16b
@@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm

.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0]
ins v10.d[1], v10.d[0]
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]

/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // -I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
dup v21.4s, v2.s[0] // R[TEMP]
dup v22.4s, v2.s[0] // R[TEMP]
dup v23.4s, v3.s[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v24.4s, v25.s[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // R[TEMP]
dup v23.4s, v3.s[0] // I[TEMP]
dup v24.4s, v3.s[0] // I[TEMP]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
dup v21.4s, v2.s[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s2
dup v22.4s, v25.s[0] // R[TEMP]

eor v25.16b, v25.16b, v25.16b
fsub s25, s25, s3
dup v23.4s, v25.s[0] // I[TEMP]
dup v24.4s, v25.s[0] // I[TEMP]
#endif
#endif // CONJ


/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif // CONJ

#else // DOUBLE
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)

/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
/****** INIT_LOOP FOR F4 LOOP ******/
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // -I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
dup v21.2d, v2.d[0] // R[TEMP]
dup v22.2d, v2.d[0] // R[TEMP]
dup v23.2d, v3.d[0] // I[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v24.2d, v25.d[0] // -I[TEMP]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // R[TEMP]
dup v23.2d, v3.d[0] // I[TEMP]
dup v24.2d, v3.d[0] // I[TEMP]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
eor v12.16b, v12.16b, v12.16b
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
dup v21.2d, v2.d[0] // R[TEMP]
eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d2
dup v22.2d, v25.d[0] // R[TEMP]

eor v25.16b, v25.16b, v25.16b
fsub d25, d25, d3
dup v23.2d, v25.d[0] // I[TEMP]
dup v24.2d, v25.d[0] // I[TEMP]
#endif
#endif // CONJ


/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
eor v4.16b, v4.16b, v4.16b
@@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ld2 {v13.4s, v14.4s}, [A_PTR], #32
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ

prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]

fmla v15.4s, v21.4s, v13.4s
fmla v15.4s, v23.4s, v14.4s
fmla v16.4s, v22.4s, v14.4s
fmla v16.4s, v24.4s, v13.4s

st2 {v15.4s, v16.4s}, [Y_OPTR], #32

#else // DOUBLE

ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE]

fmla v15.2d, v21.2d, v13.2d
fmla v15.2d, v23.2d, v14.2d
fmla v16.2d, v22.2d, v14.2d
fmla v16.2d, v24.2d, v13.2d

st2 {v15.2d, v16.2d}, [Y_OPTR], #32

ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#endif
#endif // CONJ
prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]

fmla v19.2d, v21.2d, v17.2d
fmla v19.2d, v23.2d, v18.2d
fmla v20.2d, v22.2d, v18.2d
fmla v20.2d, v24.2d, v17.2d

st2 {v19.2d, v20.2d}, [Y_OPTR], #32

#endif
@@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:

zgemv_n_kernel_F4:

KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F4

subs I, I, #1
bne zgemv_n_kernel_F4


+ 7
- 2
kernel/arm64/zgemv_t.S View File

@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define J x11 /* loop variable */
#define I x12 /* loop variable */

#define A_PRE_SIZE 768
#define X_PRE_SIZE 768

/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
@@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE]

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
@@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE]

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]


Loading…
Cancel
Save