Browse Source

ARM64: Convert all labels to local labels

When applications are debugged or profiled with perf or similar tools, the
kernels appear scattered throughout the profile reports. This happens because
the labels within the kernels are not local, so each label is reported as a
separate function.

To avoid this, all labels within the kernels are converted to local
(`.L`-prefixed) labels, which the assembler omits from the symbol table.
tags/v0.3.0
Ashwin Sekhar T K 8 years ago
parent
commit
a0128aa489
50 changed files with 4469 additions and 4469 deletions
  1. +25
    -25
      kernel/arm64/amax.S
  2. +20
    -20
      kernel/arm64/asum.S
  3. +21
    -21
      kernel/arm64/axpy.S
  4. +20
    -20
      kernel/arm64/casum.S
  5. +142
    -142
      kernel/arm64/cgemm_kernel_4x4.S
  6. +175
    -175
      kernel/arm64/cgemm_kernel_8x4.S
  7. +175
    -175
      kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
  8. +20
    -20
      kernel/arm64/copy.S
  9. +129
    -129
      kernel/arm64/ctrmm_kernel_4x4.S
  10. +175
    -175
      kernel/arm64/ctrmm_kernel_8x4.S
  11. +22
    -22
      kernel/arm64/daxpy_thunderx2t99.S
  12. +143
    -143
      kernel/arm64/dgemm_kernel_4x4.S
  13. +176
    -176
      kernel/arm64/dgemm_kernel_4x8.S
  14. +169
    -169
      kernel/arm64/dgemm_kernel_8x4.S
  15. +169
    -169
      kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S
  16. +36
    -36
      kernel/arm64/dgemm_ncopy_4.S
  17. +48
    -48
      kernel/arm64/dgemm_ncopy_8.S
  18. +36
    -36
      kernel/arm64/dgemm_tcopy_4.S
  19. +56
    -56
      kernel/arm64/dgemm_tcopy_8.S
  20. +20
    -20
      kernel/arm64/dot.S
  21. +129
    -129
      kernel/arm64/dtrmm_kernel_4x4.S
  22. +176
    -176
      kernel/arm64/dtrmm_kernel_4x8.S
  23. +169
    -169
      kernel/arm64/dtrmm_kernel_8x4.S
  24. +31
    -31
      kernel/arm64/gemv_n.S
  25. +31
    -31
      kernel/arm64/gemv_t.S
  26. +24
    -24
      kernel/arm64/iamax.S
  27. +24
    -24
      kernel/arm64/izamax.S
  28. +16
    -16
      kernel/arm64/nrm2.S
  29. +20
    -20
      kernel/arm64/rot.S
  30. +23
    -23
      kernel/arm64/scal.S
  31. +221
    -221
      kernel/arm64/sgemm_kernel_16x4.S
  32. +221
    -221
      kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
  33. +155
    -155
      kernel/arm64/sgemm_kernel_4x4.S
  34. +241
    -241
      kernel/arm64/sgemm_kernel_8x8.S
  35. +221
    -221
      kernel/arm64/strmm_kernel_16x4.S
  36. +130
    -130
      kernel/arm64/strmm_kernel_4x4.S
  37. +241
    -241
      kernel/arm64/strmm_kernel_8x8.S
  38. +21
    -21
      kernel/arm64/swap.S
  39. +25
    -25
      kernel/arm64/zamax.S
  40. +20
    -20
      kernel/arm64/zasum.S
  41. +21
    -21
      kernel/arm64/zaxpy.S
  42. +20
    -20
      kernel/arm64/zdot.S
  43. +130
    -130
      kernel/arm64/zgemm_kernel_4x4.S
  44. +130
    -130
      kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
  45. +26
    -26
      kernel/arm64/zgemv_n.S
  46. +26
    -26
      kernel/arm64/zgemv_t.S
  47. +16
    -16
      kernel/arm64/znrm2.S
  48. +20
    -20
      kernel/arm64/zrot.S
  49. +34
    -34
      kernel/arm64/zscal.S
  50. +130
    -130
      kernel/arm64/ztrmm_kernel_4x4.S

+ 25
- 25
kernel/arm64/amax.S View File

@@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lamax_kernel_S_BEGIN

amax_kernel_F_BEGIN:
.Lamax_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lamax_kernel_F1_INIT

INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lamax_kernel_F1

amax_kernel_F4:
.Lamax_kernel_F4:

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4
bne .Lamax_kernel_F4

amax_kernel_F1:
.Lamax_kernel_F1:

ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999

amax_kernel_F10:
.Lamax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10
bne .Lamax_kernel_F10

ret

amax_kernel_F1_INIT:
.Lamax_kernel_F1_INIT:

INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lamax_kernel_F1

amax_kernel_S_BEGIN:
.Lamax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble amax_kernel_L999
ble .Lamax_kernel_L999

asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lamax_kernel_S1

amax_kernel_S4:
.Lamax_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4
bne .Lamax_kernel_S4

amax_kernel_S1:
.Lamax_kernel_S1:

ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999

amax_kernel_S10:
.Lamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10
bne .Lamax_kernel_S10

amax_kernel_L999:
.Lamax_kernel_L999:

ret

amax_kernel_zero:
.Lamax_kernel_zero:

fmov MAXF, REG0
ret


+ 20
- 20
kernel/arm64/asum.S View File

@@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

cmp N, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lasum_kernel_S_BEGIN

asum_kernel_F_BEGIN:
.Lasum_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lasum_kernel_F1

asum_kernel_F8:
.Lasum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8
bne .Lasum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:
.Lasum_kernel_F1:

ands I, N, #7
ble asum_kernel_L999
ble .Lasum_kernel_L999

asum_kernel_F10:
.Lasum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10
bne .Lasum_kernel_F10

asum_kernel_L999:
.Lasum_kernel_L999:
ret

asum_kernel_S_BEGIN:
.Lasum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lasum_kernel_S1

asum_kernel_S4:
.Lasum_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4
bne .Lasum_kernel_S4

asum_kernel_S1:
.Lasum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999
ble .Lasum_kernel_L999

asum_kernel_S10:
.Lasum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10
bne .Lasum_kernel_S10

ret



+ 21
- 21
kernel/arm64/axpy.S View File

@@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble axpy_kernel_L999
ble .Laxpy_kernel_L999

fcmp DA, #0.0
beq axpy_kernel_L999
beq .Laxpy_kernel_L999

cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN

axpy_kernel_F_BEGIN:
.Laxpy_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1
beq .Laxpy_kernel_F1

axpy_kernel_F8:
.Laxpy_kernel_F8:

KERNEL_F8

subs I, I, #1
bne axpy_kernel_F8
bne .Laxpy_kernel_F8

axpy_kernel_F1:
.Laxpy_kernel_F1:

ands I, N, #7
ble axpy_kernel_L999
ble .Laxpy_kernel_L999

axpy_kernel_F10:
.Laxpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10
bne .Laxpy_kernel_F10

mov w0, wzr
ret

axpy_kernel_S_BEGIN:
.Laxpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Laxpy_kernel_S1

axpy_kernel_S4:
.Laxpy_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4
bne .Laxpy_kernel_S4

axpy_kernel_S1:
.Laxpy_kernel_S1:

ands I, N, #3
ble axpy_kernel_L999
ble .Laxpy_kernel_L999

axpy_kernel_S10:
.Laxpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10
bne .Laxpy_kernel_S10

axpy_kernel_L999:
.Laxpy_kernel_L999:

mov w0, wzr
ret

+ 20
- 20
kernel/arm64/casum.S View File

@@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF

cmp N, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lcasum_kernel_S_BEGIN

asum_kernel_F_BEGIN:
.Lcasum_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
beq .Lcasum_kernel_F1

asum_kernel_F8:
.Lcasum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8
bne .Lcasum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:
.Lcasum_kernel_F1:

ands I, N, #7
ble asum_kernel_L999
ble .Lcasum_kernel_L999

asum_kernel_F10:
.Lcasum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10
bne .Lcasum_kernel_F10

asum_kernel_L999:
.Lcasum_kernel_L999:
ret

asum_kernel_S_BEGIN:
.Lcasum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lcasum_kernel_S1

asum_kernel_S4:
.Lcasum_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4
bne .Lcasum_kernel_S4

asum_kernel_S1:
.Lcasum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999
ble .Lcasum_kernel_L999

asum_kernel_S10:
.Lcasum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10
bne .Lcasum_kernel_S10

ret



+ 142
- 142
kernel/arm64/cgemm_kernel_4x4.S View File

@@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN

/******************************************************************************/

cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

@@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array
add ppA, temp, pA

cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN

cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:

mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32

KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K

subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5

cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22


cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44

cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:

tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:

INIT8x4

cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:

ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100

cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB

cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:

SAVE8x4

cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20


cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN

cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:

INIT4x4

mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40

cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:

KERNEL4x4_SUB
KERNEL4x4_SUB
@@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22


cgemm_kernel_L4_M4_40:
.Lcgemm_kernel_L4_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100

cgemm_kernel_L4_M4_42:
.Lcgemm_kernel_L4_M4_42:

KERNEL4x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42
bgt .Lcgemm_kernel_L4_M4_42

cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:

SAVE4x4

cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:


cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN

cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40

cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22


cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100

cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42

cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:

SAVE2x4

cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:


cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40

cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22


cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100

cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42

cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:

SAVE1x4


cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN


/******************************************************************************/

cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction



cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN

cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5

cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22


cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100

cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42

cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:

SAVE4x2

cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20
bgt .Lcgemm_kernel_L2_M4_20


cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN

cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40

cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22


cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100

cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42

cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:

SAVE2x2

cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:


cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40

cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22


cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100

cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42

cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:

SAVE1x2


cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:



cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN

cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5

cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22


cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100

cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42

cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:

SAVE4x1

cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20
bgt .Lcgemm_kernel_L1_M4_20


cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN

cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40

cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22


cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100

cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42

cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:

SAVE2x1

cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:


cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40

cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22


cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100

cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42

cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:

SAVE1x1


cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:


cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4.S View File

@@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN

/******************************************************************************/

cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN

.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:

mov pB, origPB

asr counterL , origK, #3
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a

.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22

.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44

.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:

tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44

cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:

INIT8x4

cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:

ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100

.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46

cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

SAVE8x4

cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20

cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN


cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:

mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5


cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22

cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:

INIT4x4

cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100

cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB

cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:

SAVE4x4

cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:

cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN

cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40

cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22


cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100

cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42

cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:

SAVE2x4

cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:


cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40

cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22


cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100

cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42

cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:

SAVE1x4


cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN


/******************************************************************************/

cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A


cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN

cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:

INIT8x2

@@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5

cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22


cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100

cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42

cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:

SAVE8x2

cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20

cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN

cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5

cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22


cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100

cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42

cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:

SAVE4x2

cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:

cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN

cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40

cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22


cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100

cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42

cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:

SAVE2x2

cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:


cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40

cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22


cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100

cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42

cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:

SAVE1x2


cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A


cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN

cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:

INIT8x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5

cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22


cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100

cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42

cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:

SAVE8x1

cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20

cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN


cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5

cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22


cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100

cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42

cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:

SAVE4x1

cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:


cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN

cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40

cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22


cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100

cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42

cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:

SAVE2x1

cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:


cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40

cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22


cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100

cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42

cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:

SAVE1x1


cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:


cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S View File

@@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN

/******************************************************************************/

cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN

.align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:

mov pB, origPB

asr counterL , origK, #5 // origK / 32
cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8

subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a

.align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:

KERNEL8x4_M1_M2_x16

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22

.align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:

KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4
@@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44

.align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:

tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44

cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:

INIT8x4

cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:

ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100

.align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46

cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

SAVE8x4

cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20

cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN


cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:

mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5


cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22

cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:

INIT4x4

cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100

cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB

cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:

SAVE4x4

cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:

cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN

cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40

cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22


cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100

cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42

cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:

SAVE2x4

cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:


cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END

cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40

cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22


cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100

cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42

cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:

SAVE1x4


cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN


/******************************************************************************/

cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A


cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN

cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:

INIT8x2

@@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5

cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22


cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100

cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42

cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:

SAVE8x2

cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20

cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN

cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5

cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22


cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100

cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42

cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:

SAVE4x2

cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:

cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN

cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40

cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22


cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100

cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42

cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:

SAVE2x2

cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:


cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END

cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40

cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22


cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100

cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42

cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:

SAVE1x2


cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A


cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN

cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:

INIT8x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5

cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22


cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100

cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42

cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:

SAVE8x1

cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20

cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN


cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5

cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22


cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100

cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42

cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:

SAVE4x1

cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:


cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN

cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40

cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22


cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100

cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42

cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:

SAVE2x1

cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:


cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END

cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40

cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22


cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100

cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42

cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:

SAVE1x1


cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:


cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 20
- 20
kernel/arm64/copy.S View File

@@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble copy_kernel_L999
ble .Lcopy_kernel_L999

cmp INC_X, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN

copy_kernel_F_BEGIN:
.Lcopy_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq copy_kernel_F1
beq .Lcopy_kernel_F1

copy_kernel_F4:
.Lcopy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne copy_kernel_F4
bne .Lcopy_kernel_F4

copy_kernel_F1:
.Lcopy_kernel_F1:

ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999

copy_kernel_F10:
.Lcopy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne copy_kernel_F10
bne .Lcopy_kernel_F10

mov w0, wzr
ret

copy_kernel_S_BEGIN:
.Lcopy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble copy_kernel_S1
ble .Lcopy_kernel_S1

copy_kernel_S4:
.Lcopy_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1

subs I, I, #1
bne copy_kernel_S4
bne .Lcopy_kernel_S4

copy_kernel_S1:
.Lcopy_kernel_S1:

ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999

copy_kernel_S10:
.Lcopy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne copy_kernel_S10
bne .Lcopy_kernel_S10

copy_kernel_L999:
.Lcopy_kernel_L999:

mov w0, wzr
ret


+ 129
- 129
kernel/arm64/ctrmm_kernel_4x4.S View File

@@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN

/******************************************************************************/

ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

@@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array

ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN

ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:

asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5

ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22


ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_E

b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44

ctrmm_kernel_L4_M4_32:
.Lctrmm_kernel_L4_M4_32:

tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_E

b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44


ctrmm_kernel_L4_M4_40:
.Lctrmm_kernel_L4_M4_40:

INIT4x4

ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:

ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100

ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB

ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:

SAVE4x4

@@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20
bne .Lctrmm_kernel_L4_M4_20

ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN

ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:

INIT2x4

@@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40

ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22


ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100

ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42

ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:

SAVE2x4

@@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:


ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END

ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:

INIT1x4

@@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40

ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22


ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100

ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42

ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:

SAVE1x4

@@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif

ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN


/******************************************************************************/

ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov pA, origPA // pA = A

ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN

ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:

INIT4x2

@@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5

ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22


ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100

ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42

ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:

SAVE4x2

@@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20
bgt .Lctrmm_kernel_L2_M4_20


ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN

ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:

INIT2x2

@@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40

ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22


ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100

ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42

ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:

SAVE2x2

@@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:


ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END

ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:

INIT1x2

@@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40

ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22


ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100

ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42

ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif

ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:

/******************************************************************************/

ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:

mov pA, origPA // pA = A

ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN

ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:

INIT4x1

@@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5

ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22


ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100

ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42

ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:

SAVE4x1

@@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20
bgt .Lctrmm_kernel_L1_M4_20


ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN

ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:

INIT2x1

@@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40

ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22


ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100

ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42

ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:


ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END

ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:

INIT1x1

@@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40

ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22


ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100

ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42

ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:

SAVE1x1


ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:


ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/ctrmm_kernel_8x4.S View File

@@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN

/******************************************************************************/

ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array

ctrmm_kernel_L4_M8_BEGIN:
.Lctrmm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN
ble .Lctrmm_kernel_L4_M4_BEGIN

ctrmm_kernel_L4_M8_20:
.Lctrmm_kernel_L4_M8_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:

asr counterL , tempK, #3
cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
blt .Lctrmm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
ble .Lctrmm_kernel_L4_M8_22a

.align 5
ctrmm_kernel_L4_M8_22:
.Lctrmm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
bgt .Lctrmm_kernel_L4_M8_22

.align 5
ctrmm_kernel_L4_M8_22a:
.Lctrmm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44

.align 5
ctrmm_kernel_L4_M8_32:
.Lctrmm_kernel_L4_M8_32:

tst counterL, #1
ble ctrmm_kernel_L4_M8_40
ble .Lctrmm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44

ctrmm_kernel_L4_M8_40:
.Lctrmm_kernel_L4_M8_40:

INIT8x4

ctrmm_kernel_L4_M8_44:
.Lctrmm_kernel_L4_M8_44:

ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
ble .Lctrmm_kernel_L4_M8_100

.align 5
ctrmm_kernel_L4_M8_46:
.Lctrmm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46
bne .Lctrmm_kernel_L4_M8_46

ctrmm_kernel_L4_M8_100:
.Lctrmm_kernel_L4_M8_100:

SAVE8x4

@@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

ctrmm_kernel_L4_M8_END:
.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20
bne .Lctrmm_kernel_L4_M8_20

ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END

tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN


ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:

asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5


ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22

ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_40:

INIT4x4

ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100

ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB

ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:

SAVE4x4

@@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:

ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN

ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:

INIT2x4

@@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40

ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22


ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100

ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42

ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:

SAVE2x4

@@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:


ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END

ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:

INIT1x4

@@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40

ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22


ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100

ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42

ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:

SAVE1x4

@@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif

ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN


/******************************************************************************/

ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A

ctrmm_kernel_L2_M8_BEGIN:
.Lctrmm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN
ble .Lctrmm_kernel_L2_M4_BEGIN

ctrmm_kernel_L2_M8_20:
.Lctrmm_kernel_L2_M8_20:

INIT8x2

@@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M8_40
ble .Lctrmm_kernel_L2_M8_40
.align 5

ctrmm_kernel_L2_M8_22:
.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22
bgt .Lctrmm_kernel_L2_M8_22


ctrmm_kernel_L2_M8_40:
.Lctrmm_kernel_L2_M8_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100
ble .Lctrmm_kernel_L2_M8_100

ctrmm_kernel_L2_M8_42:
.Lctrmm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42
bgt .Lctrmm_kernel_L2_M8_42

ctrmm_kernel_L2_M8_100:
.Lctrmm_kernel_L2_M8_100:

SAVE8x2

@@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif

ctrmm_kernel_L2_M8_END:
.Lctrmm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20
bgt .Lctrmm_kernel_L2_M8_20

ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN

ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:

INIT4x2

@@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5

ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22


ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100

ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42

ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:

SAVE4x2

@@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:


ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN

ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:

INIT2x2

@@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40

ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22


ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100

ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42

ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:

SAVE2x2

@@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:


ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END

ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:

INIT1x2

@@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40

ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22


ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100

ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42

ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:

SAVE1x2

@@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif

ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:

/******************************************************************************/

ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done

mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A

ctrmm_kernel_L1_M8_BEGIN:
.Lctrmm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN
ble .Lctrmm_kernel_L1_M4_BEGIN

ctrmm_kernel_L1_M8_20:
.Lctrmm_kernel_L1_M8_20:

INIT8x1

@@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M8_40
ble .Lctrmm_kernel_L1_M8_40
.align 5

ctrmm_kernel_L1_M8_22:
.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22
bgt .Lctrmm_kernel_L1_M8_22


ctrmm_kernel_L1_M8_40:
.Lctrmm_kernel_L1_M8_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100
ble .Lctrmm_kernel_L1_M8_100

ctrmm_kernel_L1_M8_42:
.Lctrmm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42
bgt .Lctrmm_kernel_L1_M8_42

ctrmm_kernel_L1_M8_100:
.Lctrmm_kernel_L1_M8_100:

SAVE8x1

@@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif

ctrmm_kernel_L1_M8_END:
.Lctrmm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20
bgt .Lctrmm_kernel_L1_M8_20

ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN

ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:

INIT4x1

@@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5

ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22


ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100

ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42

ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:

SAVE4x1

@@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:

ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN

ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:

INIT2x1

@@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40

ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22


ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100

ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42

ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:

SAVE2x1

@@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif

ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:


ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END

ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:

INIT1x1

@@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40

ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22


ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100

ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42

ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:

SAVE1x1


ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:


ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 22
- 22
kernel/arm64/daxpy_thunderx2t99.S View File

@@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999

fcmp DA, #0.0
beq axpy_kernel_L999
beq .Ldaxpy_kernel_L999

cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN

axpy_kernel_F_BEGIN:
.Ldaxpy_kernel_F_BEGIN:

asr I, N, #5
cmp I, xzr
beq axpy_kernel_F1
beq .Ldaxpy_kernel_F1

.align 5
axpy_kernel_F32:
.Ldaxpy_kernel_F32:

KERNEL_F32

subs I, I, #1
bne axpy_kernel_F32
bne .Ldaxpy_kernel_F32

axpy_kernel_F1:
.Ldaxpy_kernel_F1:

ands I, N, #31
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999

axpy_kernel_F10:
.Ldaxpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10
bne .Ldaxpy_kernel_F10

b axpy_kernel_L999
b .Ldaxpy_kernel_L999

axpy_kernel_S_BEGIN:
.Ldaxpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
ble .Ldaxpy_kernel_S1

axpy_kernel_S4:
.Ldaxpy_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4
bne .Ldaxpy_kernel_S4

axpy_kernel_S1:
.Ldaxpy_kernel_S1:

ands I, N, #3
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999

axpy_kernel_S10:
.Ldaxpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10
bne .Ldaxpy_kernel_S10

axpy_kernel_L999:
.Ldaxpy_kernel_L999:

mov w0, wzr
ret

+ 143
- 143
kernel/arm64/dgemm_kernel_4x4.S View File

@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN

dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:

//------------------------------------------------------------------------------

dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN

.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:

mov pB, origPB
asr counterL , origK, #2 // L = K / 4
cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a

.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22

.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44

.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:

tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:

INIT8x4

dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:

ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100

.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46

dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:

SAVE8x4

dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20

dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN

dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:

INIT4x4

mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40

dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:

KERNEL4x4_SUB
KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100

dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:

KERNEL4x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42

dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:

SAVE4x4

dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN

dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40

dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22


dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100

dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42

dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:

SAVE2x4

dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:


dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40

dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22


dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100

dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42

dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:

SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN


/******************************************************************************/

dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction



dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN

dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5

dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22


dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100

dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42

dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:

SAVE4x2

dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20


dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN

dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40

dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100

dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42

dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:

SAVE2x2

dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:


dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40

dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100

dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42

dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:

SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:



dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN

dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5

dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22


dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100

dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42

dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:

SAVE4x1

dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN

dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40

dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100

dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42

dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:

SAVE2x1

dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:


dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40

dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22


dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100

dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42

dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:

SAVE1x1


dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:


dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dgemm_kernel_4x8.S View File

@@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN
ble .Ldgemm_kernel_L4_BEGIN

/******************************************************************************/

dgemm_kernel_L8_BEGIN:
.Ldgemm_kernel_L8_BEGIN:

mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3

mov pA, origPA // pA = start of A array

dgemm_kernel_L8_M4_BEGIN:
.Ldgemm_kernel_L8_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN
ble .Ldgemm_kernel_L8_M2_BEGIN

dgemm_kernel_L8_M4_20:
.Ldgemm_kernel_L8_M4_20:

mov pB, origPB

asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32
blt .Ldgemm_kernel_L8_M4_32

KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K

subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a
ble .Ldgemm_kernel_L8_M4_22a
.align 5

dgemm_kernel_L8_M4_22:
.Ldgemm_kernel_L8_M4_22:

KERNEL4x8_M1
KERNEL4x8_M2

subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22
bgt .Ldgemm_kernel_L8_M4_22


dgemm_kernel_L8_M4_22a:
.Ldgemm_kernel_L8_M4_22a:

KERNEL4x8_M1
KERNEL4x8_E

b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44

dgemm_kernel_L8_M4_32:
.Ldgemm_kernel_L8_M4_32:

tst counterL, #1
ble dgemm_kernel_L8_M4_40
ble .Ldgemm_kernel_L8_M4_40

KERNEL4x8_I

KERNEL4x8_E

b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44


dgemm_kernel_L8_M4_40:
.Ldgemm_kernel_L8_M4_40:

INIT4x8

dgemm_kernel_L8_M4_44:
.Ldgemm_kernel_L8_M4_44:

ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100
ble .Ldgemm_kernel_L8_M4_100

dgemm_kernel_L8_M4_46:
.Ldgemm_kernel_L8_M4_46:

KERNEL4x8_SUB

dgemm_kernel_L8_M4_100:
.Ldgemm_kernel_L8_M4_100:

SAVE4x8

dgemm_kernel_L8_M4_END:
.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20
bne .Ldgemm_kernel_L8_M4_20

dgemm_kernel_L8_M2_BEGIN:
.Ldgemm_kernel_L8_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L8_M1_BEGIN
ble .Ldgemm_kernel_L8_M1_BEGIN

dgemm_kernel_L8_M2_20:
.Ldgemm_kernel_L8_M2_20:

INIT2x8

@@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M2_40
ble .Ldgemm_kernel_L8_M2_40

dgemm_kernel_L8_M2_22:
.Ldgemm_kernel_L8_M2_22:

KERNEL2x8_SUB
KERNEL2x8_SUB
@@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22
bgt .Ldgemm_kernel_L8_M2_22


dgemm_kernel_L8_M2_40:
.Ldgemm_kernel_L8_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M2_100
ble .Ldgemm_kernel_L8_M2_100

dgemm_kernel_L8_M2_42:
.Ldgemm_kernel_L8_M2_42:

KERNEL2x8_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42
bgt .Ldgemm_kernel_L8_M2_42

dgemm_kernel_L8_M2_100:
.Ldgemm_kernel_L8_M2_100:

SAVE2x8

dgemm_kernel_L8_M2_END:
.Ldgemm_kernel_L8_M2_END:


dgemm_kernel_L8_M1_BEGIN:
.Ldgemm_kernel_L8_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END

dgemm_kernel_L8_M1_20:
.Ldgemm_kernel_L8_M1_20:

INIT1x8

@@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L8_M1_40
ble .Ldgemm_kernel_L8_M1_40

dgemm_kernel_L8_M1_22:
.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22
bgt .Ldgemm_kernel_L8_M1_22


dgemm_kernel_L8_M1_40:
.Ldgemm_kernel_L8_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M1_100
ble .Ldgemm_kernel_L8_M1_100

dgemm_kernel_L8_M1_42:
.Ldgemm_kernel_L8_M1_42:

KERNEL1x8_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42
bgt .Ldgemm_kernel_L8_M1_42

dgemm_kernel_L8_M1_100:
.Ldgemm_kernel_L8_M1_100:

SAVE1x8

dgemm_kernel_L8_END:
.Ldgemm_kernel_L8_END:

lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8

subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L8_BEGIN
bgt .Ldgemm_kernel_L8_BEGIN


/******************************************************************************/

dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:

mov counterJ , origN
tst counterJ , #7
ble dgemm_kernel_L999
ble .Ldgemm_kernel_L999

tst counterJ , #4
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN

mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

mov pA, origPA // pA = start of A array

dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN

dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:

mov pB, origPB

asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M4_32
blt .Ldgemm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a
ble .Ldgemm_kernel_L4_M4_22a
.align 5

dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_22a:
.Ldgemm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_E

b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44

dgemm_kernel_L4_M4_32:
.Ldgemm_kernel_L4_M4_32:

tst counterL, #1
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40

KERNEL4x4_I

KERNEL4x4_E

b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:

INIT4x4

dgemm_kernel_L4_M4_44:
.Ldgemm_kernel_L4_M4_44:

ands counterL , origK, #1
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100

dgemm_kernel_L4_M4_46:
.Ldgemm_kernel_L4_M4_46:

KERNEL4x4_SUB

dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:

SAVE4x4

dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20
bne .Ldgemm_kernel_L4_M4_20

dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN

dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:

INIT2x4

@@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40

dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22


dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100

dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42

dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:

SAVE2x4

dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:


dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:

INIT1x4

@@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40

dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22


dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100

dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42

dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:

SAVE1x4

dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

/******************************************************************************/

dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A


dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN

dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:

INIT4x2

@@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5

dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22


dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100

dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42

dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:

SAVE4x2

dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20


dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN

dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:

INIT2x2

@@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40

dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100

dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42

dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:

SAVE2x2

dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:


dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:

INIT1x2

@@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40

dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100

dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42

dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:

SAVE1x2

dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:

mov pA, origPA // pA = A

dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN

dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5

dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22


dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100

dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42

dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:

SAVE4x1

dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN

dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:

INIT2x1

@@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40

dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100

dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42

dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:

SAVE2x1

dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:


dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:

INIT1x1

@@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40

dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22


dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100

dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42

dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:

SAVE1x1


dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:


dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4.S View File

@@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN

/******************************************************************************/

.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN

.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:

mov pB, origPB

asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a

.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22

.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44

.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:

tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44

dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:

INIT8x4

dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:

ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100

.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46

dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

SAVE8x4

dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20

dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN

dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:

INIT4x4

@@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40

.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:

KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22

dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100

dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:

KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42

dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:

SAVE4x4

dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:

dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN

dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:

INIT2x4

@@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40

.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22


dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:

KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42

dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:

SAVE2x4

dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:


dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:

INIT1x4

@@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40

.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
@@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22


dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:

KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42

dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:

SAVE1x4

dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN


/******************************************************************************/

dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN

mov pCRow0, pC
add pCRow1, pCRow0, LDC
@@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov pA, origPA // pA = A

dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN

.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:

INIT8x2

@@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40

.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22

dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42

dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:

SAVE8x2

dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20

dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN

dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:

INIT4x2

@@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40

.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
@@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22


dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:

KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42

dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:

SAVE4x2

dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN

dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:

INIT2x2

@@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40

dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100

dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42

dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:

SAVE2x2

dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:


dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:

INIT1x2

@@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40

dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100

dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42

dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:

SAVE1x2

dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done

mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next

mov pA, origPA // pA = A

dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN

.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:

INIT8x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40

.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22


dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42

dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:

SAVE8x1

dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20

dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN

dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40

.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
@@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22


dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:

KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42

dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:

SAVE4x1

dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:

dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN

dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:

INIT2x1

@@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40

dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100

dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42

dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:

SAVE2x1

dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:


dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:

INIT1x1

@@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22


dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100

prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42

dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:

SAVE1x1


dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:


dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S View File

@@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN

/******************************************************************************/

.align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN

.align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:

mov pB, origPB

asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32

KERNEL8x4_I
KERNEL8x4_M2
@@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1

subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a

.align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:

KERNEL8x4_M1_M2_x64

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22

.align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:

KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16
@@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44

.align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:

tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44

dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:

INIT8x4

dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:

ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100

.align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46

dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:

SAVE8x4

dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20

dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN

dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:

INIT4x4

@@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40

.align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:

KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22

dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100

dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:

KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42

dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:

SAVE4x4

dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:

dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN

dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:

INIT2x4

@@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40

.align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22


dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:

KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42

dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:

SAVE2x4

dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:


dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END

dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:

INIT1x4

@@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40

.align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB
@@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22


dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:

KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42

dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:

SAVE1x4

dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN


/******************************************************************************/

dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN

mov pCRow0, pC
add pCRow1, pCRow0, LDC
@@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov pA, origPA // pA = A

dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN

.align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:

INIT8x2

@@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40

.align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22

dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100

prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42

dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:

SAVE8x2

dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20

dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN

dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:

INIT4x2

@@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40

.align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB
@@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22


dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100

prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:

KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42

dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:

SAVE4x2

dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN

dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:

INIT2x2

@@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40

dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100

dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42

dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:

SAVE2x2

dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:


dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END

dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:

INIT1x2

@@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40

dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100

dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42

dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:

SAVE1x2

dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/

dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done

mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next

mov pA, origPA // pA = A

dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN

.align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:

INIT8x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40

.align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22


dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100

prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42

dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:

SAVE8x1

dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20

dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN

dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40

.align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB
@@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22


dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100

prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:

KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42

dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:

SAVE4x1

dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:

dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN

dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:

INIT2x1

@@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40

dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100

dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42

dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:

SAVE2x1

dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:


dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END

dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:

INIT1x1

@@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:

asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22


dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100

prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42

dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:

SAVE1x1


dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:


dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 36
- 36
kernel/arm64/dgemm_ncopy_4.S View File

@@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

lsl LDA, LDA, #3 // LDA = LDA * SIZE

dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:

asr J, N, #2 // J = N / 4
cmp J, #0
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN

.align 5
dgemm_ncopy_L4_M4_BEGIN:
.Ldgemm_ncopy_L4_M4_BEGIN:

mov A01, A00
add A02, A01, LDA
@@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:

asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L4_M4_40
ble .Ldgemm_ncopy_L4_M4_40

.align 5
dgemm_ncopy_L4_M4_20:
.Ldgemm_ncopy_L4_M4_20:

COPY4x4

subs I , I , #1
bne dgemm_ncopy_L4_M4_20
bne .Ldgemm_ncopy_L4_M4_20


dgemm_ncopy_L4_M4_40:
.Ldgemm_ncopy_L4_M4_40:

and I, M , #3
cmp I, #0
ble dgemm_ncopy_L4_M4_END
ble .Ldgemm_ncopy_L4_M4_END

.align 5
dgemm_ncopy_L4_M4_60:
.Ldgemm_ncopy_L4_M4_60:

COPY1x4

subs I , I , #1
bne dgemm_ncopy_L4_M4_60
bne .Ldgemm_ncopy_L4_M4_60


dgemm_ncopy_L4_M4_END:
.Ldgemm_ncopy_L4_M4_END:

subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
bne .Ldgemm_ncopy_L4_M4_BEGIN



/*********************************************************************************************/

dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:

tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999

tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN

dgemm_ncopy_L2_M4_BEGIN:
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA

asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L2_M4_40
ble .Ldgemm_ncopy_L2_M4_40

.align 5
dgemm_ncopy_L2_M4_20:
.Ldgemm_ncopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne dgemm_ncopy_L2_M4_20
bne .Ldgemm_ncopy_L2_M4_20


dgemm_ncopy_L2_M4_40:
.Ldgemm_ncopy_L2_M4_40:

and I, M , #3
cmp I, #0
ble dgemm_ncopy_L2_M4_END
ble .Ldgemm_ncopy_L2_M4_END

.align 5
dgemm_ncopy_L2_M4_60:
.Ldgemm_ncopy_L2_M4_60:

COPY1x2

subs I , I , #1
bne dgemm_ncopy_L2_M4_60
bne .Ldgemm_ncopy_L2_M4_60


dgemm_ncopy_L2_M4_END:
.Ldgemm_ncopy_L2_M4_END:


/*********************************************************************************************/

dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:

tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


dgemm_ncopy_L1_M4_BEGIN:
.Ldgemm_ncopy_L1_M4_BEGIN:

mov A01, A00

asr I, M, #2 // I = M / 4
cmp I, #0
ble dgemm_ncopy_L1_M4_40
ble .Ldgemm_ncopy_L1_M4_40

.align 5
dgemm_ncopy_L1_M4_20:
.Ldgemm_ncopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne dgemm_ncopy_L1_M4_20
bne .Ldgemm_ncopy_L1_M4_20


dgemm_ncopy_L1_M4_40:
.Ldgemm_ncopy_L1_M4_40:

and I, M , #3
cmp I, #0
ble dgemm_ncopy_L1_M4_END
ble .Ldgemm_ncopy_L1_M4_END

.align 5
dgemm_ncopy_L1_M4_60:
.Ldgemm_ncopy_L1_M4_60:

COPY1x1

subs I , I , #1
bne dgemm_ncopy_L1_M4_60
bne .Ldgemm_ncopy_L1_M4_60


dgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L1_M4_END:

dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:

mov x0, #0
RESTORE_REGS


+ 48
- 48
kernel/arm64/dgemm_ncopy_8.S View File

@@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

lsl LDA, LDA, #3 // LDA = LDA * SIZE

dgemm_ncopy_L8_BEGIN:
.Ldgemm_ncopy_L8_BEGIN:

asr J, N, #3 // J = N / 8
cmp J, #0
ble dgemm_ncopy_L4_BEGIN
ble .Ldgemm_ncopy_L4_BEGIN

dgemm_ncopy_L8_M8_BEGIN:
.Ldgemm_ncopy_L8_M8_BEGIN:

mov A01, A00
add A02, A01, LDA
@@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:

asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L8_M8_40
ble .Ldgemm_ncopy_L8_M8_40

dgemm_ncopy_L8_M8_20:
.Ldgemm_ncopy_L8_M8_20:

COPY8x8

subs I , I , #1
bne dgemm_ncopy_L8_M8_20
bne .Ldgemm_ncopy_L8_M8_20


dgemm_ncopy_L8_M8_40:
.Ldgemm_ncopy_L8_M8_40:

and I, M , #7
cmp I, #0
ble dgemm_ncopy_L8_M8_END
ble .Ldgemm_ncopy_L8_M8_END

dgemm_ncopy_L8_M8_60:
.Ldgemm_ncopy_L8_M8_60:

COPY1x8

subs I , I , #1
bne dgemm_ncopy_L8_M8_60
bne .Ldgemm_ncopy_L8_M8_60


dgemm_ncopy_L8_M8_END:
.Ldgemm_ncopy_L8_M8_END:

subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
bne .Ldgemm_ncopy_L8_M8_BEGIN

/*********************************************************************************************/

dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:

tst N, #7
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999

tst N, #4
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN

dgemm_ncopy_L4_M8_BEGIN:
.Ldgemm_ncopy_L4_M8_BEGIN:

mov A01, A00
add A02, A01, LDA
@@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:

asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L4_M8_40
ble .Ldgemm_ncopy_L4_M8_40

dgemm_ncopy_L4_M8_20:
.Ldgemm_ncopy_L4_M8_20:

COPY8x4

subs I , I , #1
bne dgemm_ncopy_L4_M8_20
bne .Ldgemm_ncopy_L4_M8_20


dgemm_ncopy_L4_M8_40:
.Ldgemm_ncopy_L4_M8_40:

and I, M , #7
cmp I, #0
ble dgemm_ncopy_L4_M8_END
ble .Ldgemm_ncopy_L4_M8_END

dgemm_ncopy_L4_M8_60:
.Ldgemm_ncopy_L4_M8_60:

COPY1x4

subs I , I , #1
bne dgemm_ncopy_L4_M8_60
bne .Ldgemm_ncopy_L4_M8_60


dgemm_ncopy_L4_M8_END:
.Ldgemm_ncopy_L4_M8_END:


/*********************************************************************************************/

dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:

tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999

tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN

dgemm_ncopy_L2_M8_BEGIN:
.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA

asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L2_M8_40
ble .Ldgemm_ncopy_L2_M8_40

dgemm_ncopy_L2_M8_20:
.Ldgemm_ncopy_L2_M8_20:

COPY8x2

subs I , I , #1
bne dgemm_ncopy_L2_M8_20
bne .Ldgemm_ncopy_L2_M8_20


dgemm_ncopy_L2_M8_40:
.Ldgemm_ncopy_L2_M8_40:

and I, M , #7
cmp I, #0
ble dgemm_ncopy_L2_M8_END
ble .Ldgemm_ncopy_L2_M8_END

dgemm_ncopy_L2_M8_60:
.Ldgemm_ncopy_L2_M8_60:

COPY1x2

subs I , I , #1
bne dgemm_ncopy_L2_M8_60
bne .Ldgemm_ncopy_L2_M8_60


dgemm_ncopy_L2_M8_END:
.Ldgemm_ncopy_L2_M8_END:


/*********************************************************************************************/

dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:

tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


dgemm_ncopy_L1_M8_BEGIN:
.Ldgemm_ncopy_L1_M8_BEGIN:

mov A01, A00

asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L1_M8_40
ble .Ldgemm_ncopy_L1_M8_40

dgemm_ncopy_L1_M8_20:
.Ldgemm_ncopy_L1_M8_20:

COPY8x1

subs I , I , #1
bne dgemm_ncopy_L1_M8_20
bne .Ldgemm_ncopy_L1_M8_20


dgemm_ncopy_L1_M8_40:
.Ldgemm_ncopy_L1_M8_40:

and I, M , #7
cmp I, #0
ble dgemm_ncopy_L1_M8_END
ble .Ldgemm_ncopy_L1_M8_END

dgemm_ncopy_L1_M8_60:
.Ldgemm_ncopy_L1_M8_60:

COPY1x1

subs I , I , #1
bne dgemm_ncopy_L1_M8_60
bne .Ldgemm_ncopy_L1_M8_60


dgemm_ncopy_L1_M8_END:
.Ldgemm_ncopy_L1_M8_END:

dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:

mov x0, #0
RESTORE_REGS


+ 36
- 36
kernel/arm64/dgemm_tcopy_4.S View File

@@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

lsl M4, M, #5 // M4 = M * 4 * SIZE

dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN

.align 5
dgemm_tcopy_L4_M4_BEGIN:
.Ldgemm_tcopy_L4_M4_BEGIN:

mov A01, A
add A02, A01, LDA
@@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:

asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L4_M4_40
ble .Ldgemm_tcopy_L4_M4_40

.align 5
dgemm_tcopy_L4_M4_20:
.Ldgemm_tcopy_L4_M4_20:

COPY4x4

subs I , I , #1
bne dgemm_tcopy_L4_M4_20
bne .Ldgemm_tcopy_L4_M4_20


dgemm_tcopy_L4_M4_40:
.Ldgemm_tcopy_L4_M4_40:

tst N , #2
ble dgemm_tcopy_L4_M4_60
ble .Ldgemm_tcopy_L4_M4_60

COPY2x4


dgemm_tcopy_L4_M4_60:
.Ldgemm_tcopy_L4_M4_60:

tst N, #1
ble dgemm_tcopy_L4_M4_END
ble .Ldgemm_tcopy_L4_M4_END

COPY1x4


dgemm_tcopy_L4_M4_END:
.Ldgemm_tcopy_L4_M4_END:

subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
bne .Ldgemm_tcopy_L4_M4_BEGIN



/*********************************************************************************************/

dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:

tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999

tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN

dgemm_tcopy_L2_M4_BEGIN:
.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:

asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L2_M4_40
ble .Ldgemm_tcopy_L2_M4_40

.align 5
dgemm_tcopy_L2_M4_20:
.Ldgemm_tcopy_L2_M4_20:

COPY4x2

subs I , I , #1
bne dgemm_tcopy_L2_M4_20
bne .Ldgemm_tcopy_L2_M4_20


dgemm_tcopy_L2_M4_40:
.Ldgemm_tcopy_L2_M4_40:

tst N , #2
ble dgemm_tcopy_L2_M4_60
ble .Ldgemm_tcopy_L2_M4_60

COPY2x2

dgemm_tcopy_L2_M4_60:
.Ldgemm_tcopy_L2_M4_60:

tst N , #1
ble dgemm_tcopy_L2_M4_END
ble .Ldgemm_tcopy_L2_M4_END

COPY1x2


dgemm_tcopy_L2_M4_END:
.Ldgemm_tcopy_L2_M4_END:


/*********************************************************************************************/

dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:

tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


dgemm_tcopy_L1_M4_BEGIN:
.Ldgemm_tcopy_L1_M4_BEGIN:

mov A01, A // A01 = A
mov B01, B

asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L1_M4_40
ble .Ldgemm_tcopy_L1_M4_40

.align 5
dgemm_tcopy_L1_M4_20:
.Ldgemm_tcopy_L1_M4_20:

COPY4x1

subs I , I , #1
bne dgemm_tcopy_L1_M4_20
bne .Ldgemm_tcopy_L1_M4_20


dgemm_tcopy_L1_M4_40:
.Ldgemm_tcopy_L1_M4_40:

tst N , #2
ble dgemm_tcopy_L1_M4_60
ble .Ldgemm_tcopy_L1_M4_60

COPY2x1

dgemm_tcopy_L1_M4_60:
.Ldgemm_tcopy_L1_M4_60:

tst N , #1
ble dgemm_tcopy_L1_M4_END
ble .Ldgemm_tcopy_L1_M4_END

COPY1x1


dgemm_tcopy_L1_M4_END:
.Ldgemm_tcopy_L1_M4_END:


dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret


+ 56
- 56
kernel/arm64/dgemm_tcopy_8.S View File

@@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

lsl M8, M, #6 // M8 = M * 8 * SIZE

dgemm_tcopy_L8_BEGIN:
.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8
cmp J, #0
ble dgemm_tcopy_L4_BEGIN
ble .Ldgemm_tcopy_L4_BEGIN

.align 5
dgemm_tcopy_L8_M8_BEGIN:
.Ldgemm_tcopy_L8_M8_BEGIN:

mov A01, A
add A02, A01, LDA
@@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:

asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L8_M8_40
ble .Ldgemm_tcopy_L8_M8_40

.align 5
dgemm_tcopy_L8_M8_20:
.Ldgemm_tcopy_L8_M8_20:

COPY8x8

subs I , I , #1
bne dgemm_tcopy_L8_M8_20
bne .Ldgemm_tcopy_L8_M8_20

dgemm_tcopy_L8_M8_40:
.Ldgemm_tcopy_L8_M8_40:
tst N , #4
ble dgemm_tcopy_L8_M8_60
ble .Ldgemm_tcopy_L8_M8_60

COPY4x8

dgemm_tcopy_L8_M8_60:
.Ldgemm_tcopy_L8_M8_60:

tst N , #2
ble dgemm_tcopy_L8_M8_80
ble .Ldgemm_tcopy_L8_M8_80

COPY2x8


dgemm_tcopy_L8_M8_80:
.Ldgemm_tcopy_L8_M8_80:

tst N, #1
ble dgemm_tcopy_L8_M8_END
ble .Ldgemm_tcopy_L8_M8_END

COPY1x8


dgemm_tcopy_L8_M8_END:
.Ldgemm_tcopy_L8_M8_END:

subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
bne .Ldgemm_tcopy_L8_M8_BEGIN

/*********************************************************************************************/

dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
tst M, #7
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999

tst M, #4
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN

dgemm_tcopy_L4_M8_BEGIN:
.Ldgemm_tcopy_L4_M8_BEGIN:

mov A01, A
add A02, A01, LDA
@@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:

asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L4_M8_40
ble .Ldgemm_tcopy_L4_M8_40

.align 5
dgemm_tcopy_L4_M8_20:
.Ldgemm_tcopy_L4_M8_20:

COPY8x4

subs I , I , #1
bne dgemm_tcopy_L4_M8_20
bne .Ldgemm_tcopy_L4_M8_20

dgemm_tcopy_L4_M8_40:
.Ldgemm_tcopy_L4_M8_40:
tst N , #4
ble dgemm_tcopy_L4_M8_60
ble .Ldgemm_tcopy_L4_M8_60

COPY4x4

dgemm_tcopy_L4_M8_60:
.Ldgemm_tcopy_L4_M8_60:

tst N , #2
ble dgemm_tcopy_L4_M8_80
ble .Ldgemm_tcopy_L4_M8_80

COPY2x4


dgemm_tcopy_L4_M8_80:
.Ldgemm_tcopy_L4_M8_80:

tst N, #1
ble dgemm_tcopy_L4_M8_END
ble .Ldgemm_tcopy_L4_M8_END

COPY1x4


dgemm_tcopy_L4_M8_END:
.Ldgemm_tcopy_L4_M8_END:

/*********************************************************************************************/

dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:

tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999

tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN

dgemm_tcopy_L2_M8_BEGIN:
.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
@@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:

asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L2_M8_40
ble .Ldgemm_tcopy_L2_M8_40

.align 5
dgemm_tcopy_L2_M8_20:
.Ldgemm_tcopy_L2_M8_20:

COPY8x2

subs I , I , #1
bne dgemm_tcopy_L2_M8_20
bne .Ldgemm_tcopy_L2_M8_20

dgemm_tcopy_L2_M8_40:
.Ldgemm_tcopy_L2_M8_40:
tst N , #4
ble dgemm_tcopy_L2_M8_60
ble .Ldgemm_tcopy_L2_M8_60

COPY4x2

dgemm_tcopy_L2_M8_60:
.Ldgemm_tcopy_L2_M8_60:

tst N , #2
ble dgemm_tcopy_L2_M8_80
ble .Ldgemm_tcopy_L2_M8_80

COPY2x2

dgemm_tcopy_L2_M8_80:
.Ldgemm_tcopy_L2_M8_80:

tst N , #1
ble dgemm_tcopy_L2_M8_END
ble .Ldgemm_tcopy_L2_M8_END

COPY1x2


dgemm_tcopy_L2_M8_END:
.Ldgemm_tcopy_L2_M8_END:


/*********************************************************************************************/

dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:

tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


dgemm_tcopy_L1_M8_BEGIN:
.Ldgemm_tcopy_L1_M8_BEGIN:

mov A01, A // A01 = A
mov B01, B

asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L1_M8_40
ble .Ldgemm_tcopy_L1_M8_40

.align 5
dgemm_tcopy_L1_M8_20:
.Ldgemm_tcopy_L1_M8_20:

COPY8x1

subs I , I , #1
bne dgemm_tcopy_L1_M8_20
bne .Ldgemm_tcopy_L1_M8_20

dgemm_tcopy_L1_M8_40:
.Ldgemm_tcopy_L1_M8_40:
tst N , #4
ble dgemm_tcopy_L1_M8_60
ble .Ldgemm_tcopy_L1_M8_60

COPY4x1

dgemm_tcopy_L1_M8_60:
.Ldgemm_tcopy_L1_M8_60:

tst N , #2
ble dgemm_tcopy_L1_M8_80
ble .Ldgemm_tcopy_L1_M8_80

COPY2x1

dgemm_tcopy_L1_M8_80:
.Ldgemm_tcopy_L1_M8_80:

tst N , #1
ble dgemm_tcopy_L1_M8_END
ble .Ldgemm_tcopy_L1_M8_END

COPY1x1


dgemm_tcopy_L1_M8_END:
.Ldgemm_tcopy_L1_M8_END:


dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret


+ 20
- 20
kernel/arm64/dot.S View File

@@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

cmp N, xzr
ble dot_kernel_L999
ble .Ldot_kernel_L999

cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN

dot_kernel_F_BEGIN:
.Ldot_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Ldot_kernel_F1

dot_kernel_F4:
.Ldot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4
bne .Ldot_kernel_F4

KERNEL_F4_FINALIZE

dot_kernel_F1:
.Ldot_kernel_F1:

ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999

dot_kernel_F10:
.Ldot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10
bne .Ldot_kernel_F10

ret

dot_kernel_S_BEGIN:
.Ldot_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Ldot_kernel_S1

dot_kernel_S4:
.Ldot_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4
bne .Ldot_kernel_S4

dot_kernel_S1:
.Ldot_kernel_S1:

ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999

dot_kernel_S10:
.Ldot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10
bne .Ldot_kernel_S10

dot_kernel_L999:
.Ldot_kernel_L999:

ret



+ 129
- 129
kernel/arm64/dtrmm_kernel_4x4.S View File

@@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN

/******************************************************************************/

dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

@@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN

dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:

asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5

dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22


dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_E

b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44

dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:

tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40

KERNEL4x4_I

KERNEL4x4_E

b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:

INIT4x4

dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:

ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100

dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:

KERNEL4x4_SUB

dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:

SAVE4x4

@@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20

dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN

dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:

INIT2x4

@@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40

dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22


dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100

dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42

dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:

SAVE2x4

@@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:


dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:

INIT1x4

@@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40

dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22


dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100

dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42

dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:

SAVE1x4

@@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif

dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN


/******************************************************************************/

dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A


dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN

dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:

INIT4x2

@@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5

dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22


dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100

dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42

dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:

SAVE4x2

@@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20


dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN

dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:

INIT2x2

@@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40

dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22


dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100

dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42

dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:

SAVE2x2

@@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:


dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:

INIT1x2

@@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40

dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22


dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100

dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42

dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif

dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:

/******************************************************************************/

dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:

mov pA, origPA // pA = A

dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN

dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:

INIT4x1

@@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5

dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22


dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100

dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42

dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:

SAVE4x1

@@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20


dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN

dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:

INIT2x1

@@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40

dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22


dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100

dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42

dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:


dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:

INIT1x1

@@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40

dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22


dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100

dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42

dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:

SAVE1x1


dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:


dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dtrmm_kernel_4x8.S View File

@@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN
ble .Ldtrmm_kernel_L4_BEGIN

/******************************************************************************/

dtrmm_kernel_L8_BEGIN:
.Ldtrmm_kernel_L8_BEGIN:

mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
@@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:

mov pA, origPA // pA = start of A array

dtrmm_kernel_L8_M4_BEGIN:
.Ldtrmm_kernel_L8_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L8_M2_BEGIN
ble .Ldtrmm_kernel_L8_M2_BEGIN

dtrmm_kernel_L8_M4_20:
.Ldtrmm_kernel_L8_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:

asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32
blt .Ldtrmm_kernel_L8_M4_32

KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K

subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a
ble .Ldtrmm_kernel_L8_M4_22a
.align 5

dtrmm_kernel_L8_M4_22:
.Ldtrmm_kernel_L8_M4_22:

KERNEL4x8_M1
KERNEL4x8_M2

subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M4_22
bgt .Ldtrmm_kernel_L8_M4_22


dtrmm_kernel_L8_M4_22a:
.Ldtrmm_kernel_L8_M4_22a:

KERNEL4x8_M1
KERNEL4x8_E

b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44

dtrmm_kernel_L8_M4_32:
.Ldtrmm_kernel_L8_M4_32:

tst counterL, #1
ble dtrmm_kernel_L8_M4_40
ble .Ldtrmm_kernel_L8_M4_40

KERNEL4x8_I

KERNEL4x8_E

b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44


dtrmm_kernel_L8_M4_40:
.Ldtrmm_kernel_L8_M4_40:

INIT4x8

dtrmm_kernel_L8_M4_44:
.Ldtrmm_kernel_L8_M4_44:

ands counterL, tempK, #1
ble dtrmm_kernel_L8_M4_100
ble .Ldtrmm_kernel_L8_M4_100

dtrmm_kernel_L8_M4_46:
.Ldtrmm_kernel_L8_M4_46:

KERNEL4x8_SUB

dtrmm_kernel_L8_M4_100:
.Ldtrmm_kernel_L8_M4_100:

SAVE4x8

@@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L8_M4_END:
.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L8_M4_20
bne .Ldtrmm_kernel_L8_M4_20

dtrmm_kernel_L8_M2_BEGIN:
.Ldtrmm_kernel_L8_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L8_M1_BEGIN
ble .Ldtrmm_kernel_L8_M1_BEGIN

dtrmm_kernel_L8_M2_20:
.Ldtrmm_kernel_L8_M2_20:

INIT2x8

@@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:

asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M2_40
ble .Ldtrmm_kernel_L8_M2_40

dtrmm_kernel_L8_M2_22:
.Ldtrmm_kernel_L8_M2_22:

KERNEL2x8_SUB
KERNEL2x8_SUB
@@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22
bgt .Ldtrmm_kernel_L8_M2_22


dtrmm_kernel_L8_M2_40:
.Ldtrmm_kernel_L8_M2_40:

ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M2_100
ble .Ldtrmm_kernel_L8_M2_100

dtrmm_kernel_L8_M2_42:
.Ldtrmm_kernel_L8_M2_42:

KERNEL2x8_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_42
bgt .Ldtrmm_kernel_L8_M2_42

dtrmm_kernel_L8_M2_100:
.Ldtrmm_kernel_L8_M2_100:

SAVE2x8

@@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L8_M2_END:
.Ldtrmm_kernel_L8_M2_END:


dtrmm_kernel_L8_M1_BEGIN:
.Ldtrmm_kernel_L8_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END

dtrmm_kernel_L8_M1_20:
.Ldtrmm_kernel_L8_M1_20:

INIT1x8

@@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:

asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L8_M1_40
ble .Ldtrmm_kernel_L8_M1_40

dtrmm_kernel_L8_M1_22:
.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
@@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22
bgt .Ldtrmm_kernel_L8_M1_22


dtrmm_kernel_L8_M1_40:
.Ldtrmm_kernel_L8_M1_40:

ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M1_100
ble .Ldtrmm_kernel_L8_M1_100

dtrmm_kernel_L8_M1_42:
.Ldtrmm_kernel_L8_M1_42:

KERNEL1x8_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_42
bgt .Ldtrmm_kernel_L8_M1_42

dtrmm_kernel_L8_M1_100:
.Ldtrmm_kernel_L8_M1_100:

SAVE1x8

@@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1
#endif

dtrmm_kernel_L8_END:
.Ldtrmm_kernel_L8_END:

lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
@@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN
bgt .Ldtrmm_kernel_L8_BEGIN


/******************************************************************************/

dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:

mov counterJ , origN
tst counterJ , #7
ble dtrmm_kernel_L999
ble .Ldtrmm_kernel_L999

tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN

mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
@@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN

dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:

asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5

dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22


dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_E

b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44

dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:

tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40

KERNEL4x4_I

KERNEL4x4_E

b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:

INIT4x4

dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:

ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100

dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:

KERNEL4x4_SUB

dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:

SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20

dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN

dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:

INIT2x4

@@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40

dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22


dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100

dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42

dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:

SAVE2x4

@@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:


dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:

INIT1x4

@@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40

dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22


dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100

dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42

dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:

SAVE1x4

@@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:

/******************************************************************************/

dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A


dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN

dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:

INIT4x2

@@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5

dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22


dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100

dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42

dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:

SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20


dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN

dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:

INIT2x2

@@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40

dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22


dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100

dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42

dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:

SAVE2x2

@@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:


dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:

INIT1x2

@@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40

dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22


dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100

dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42

dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:

/******************************************************************************/

dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A

dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN

dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:

INIT4x1

@@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5

dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22


dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100

dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42

dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:

SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20


dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN

dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:

INIT2x1

@@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40

dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22


dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100

dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42

dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:


dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:

INIT1x1

@@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40

dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22


dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100

dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42

dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:

SAVE1x1


dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:


dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dtrmm_kernel_8x4.S View File

@@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN

/******************************************************************************/

dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array

dtrmm_kernel_L4_M8_BEGIN:
.Ldtrmm_kernel_L4_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
ble .Ldtrmm_kernel_L4_M4_BEGIN

.align 5
dtrmm_kernel_L4_M8_20:
.Ldtrmm_kernel_L4_M8_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:

asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
blt .Ldtrmm_kernel_L4_M8_32

KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
@@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2

subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
ble .Ldtrmm_kernel_L4_M8_22a

.align 5
dtrmm_kernel_L4_M8_22:
.Ldtrmm_kernel_L4_M8_22:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
bgt .Ldtrmm_kernel_L4_M8_22

.align 5
dtrmm_kernel_L4_M8_22a:
.Ldtrmm_kernel_L4_M8_22a:

KERNEL8x4_M1
KERNEL8x4_M2
@@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E

b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44

.align 5
dtrmm_kernel_L4_M8_32:
.Ldtrmm_kernel_L4_M8_32:

tst counterL, #1
ble dtrmm_kernel_L4_M8_40
ble .Ldtrmm_kernel_L4_M8_40

KERNEL8x4_I
KERNEL8x4_M2
@@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1
KERNEL8x4_E

b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44

dtrmm_kernel_L4_M8_40:
.Ldtrmm_kernel_L4_M8_40:

INIT8x4

dtrmm_kernel_L4_M8_44:
.Ldtrmm_kernel_L4_M8_44:

ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
ble .Ldtrmm_kernel_L4_M8_100

.align 5
dtrmm_kernel_L4_M8_46:
.Ldtrmm_kernel_L4_M8_46:

KERNEL8x4_SUB

subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
bne .Ldtrmm_kernel_L4_M8_46

dtrmm_kernel_L4_M8_100:
.Ldtrmm_kernel_L4_M8_100:

SAVE8x4

@@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

dtrmm_kernel_L4_M8_END:
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dtrmm_kernel_L4_M8_20
bne .Ldtrmm_kernel_L4_M8_20

dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

tst counterI, #4
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN

dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:

INIT4x4

@@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40

dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:

KERNEL4x4_SUB
KERNEL4x4_SUB
@@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22


dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100

dtrmm_kernel_L4_M4_42:
.Ldtrmm_kernel_L4_M4_42:

KERNEL4x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_42
bgt .Ldtrmm_kernel_L4_M4_42

dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:

SAVE4x4

@@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:


dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN

dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:

INIT2x4

@@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40

dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22


dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100

dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42

dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:

SAVE2x4

@@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:


dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END

dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:

INIT1x4

@@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40

dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22


dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100

dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42

dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:

SAVE1x4

@@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1
#endif

dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:

lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN


/******************************************************************************/

dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif
mov pA, origPA // pA = A

dtrmm_kernel_L2_M8_BEGIN:
.Ldtrmm_kernel_L2_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN
ble .Ldtrmm_kernel_L2_M4_BEGIN

dtrmm_kernel_L2_M8_20:
.Ldtrmm_kernel_L2_M8_20:

INIT8x2

@@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M8_40
ble .Ldtrmm_kernel_L2_M8_40
.align 5

dtrmm_kernel_L2_M8_22:
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
@@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22
bgt .Ldtrmm_kernel_L2_M8_22


dtrmm_kernel_L2_M8_40:
.Ldtrmm_kernel_L2_M8_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100
ble .Ldtrmm_kernel_L2_M8_100

dtrmm_kernel_L2_M8_42:
.Ldtrmm_kernel_L2_M8_42:

KERNEL8x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42
bgt .Ldtrmm_kernel_L2_M8_42

dtrmm_kernel_L2_M8_100:
.Ldtrmm_kernel_L2_M8_100:

SAVE8x2

@@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8
#endif

dtrmm_kernel_L2_M8_END:
.Ldtrmm_kernel_L2_M8_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20
bgt .Ldtrmm_kernel_L2_M8_20

dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN

dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:

INIT4x2

@@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5

dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22


dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100

dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42

dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:

SAVE4x2

@@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:


dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN

dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:

INIT2x2

@@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40

dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22


dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100

dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42

dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:

SAVE2x2

@@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:


dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END

dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:

INIT1x2

@@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40

dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22


dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100

dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42

dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif

dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:

/******************************************************************************/

dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done

mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
@@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif
mov pA, origPA // pA = A

dtrmm_kernel_L1_M8_BEGIN:
.Ldtrmm_kernel_L1_M8_BEGIN:

mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN
ble .Ldtrmm_kernel_L1_M4_BEGIN

dtrmm_kernel_L1_M8_20:
.Ldtrmm_kernel_L1_M8_20:

INIT8x1

@@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M8_40
ble .Ldtrmm_kernel_L1_M8_40
.align 5

dtrmm_kernel_L1_M8_22:
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
@@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22
bgt .Ldtrmm_kernel_L1_M8_22


dtrmm_kernel_L1_M8_40:
.Ldtrmm_kernel_L1_M8_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100
ble .Ldtrmm_kernel_L1_M8_100

dtrmm_kernel_L1_M8_42:
.Ldtrmm_kernel_L1_M8_42:

KERNEL8x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42
bgt .Ldtrmm_kernel_L1_M8_42

dtrmm_kernel_L1_M8_100:
.Ldtrmm_kernel_L1_M8_100:

SAVE8x1

@@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8
#endif

dtrmm_kernel_L1_M8_END:
.Ldtrmm_kernel_L1_M8_END:

subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20
bgt .Ldtrmm_kernel_L1_M8_20

dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
tst counterI , #7
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN

dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:

INIT4x1

@@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5

dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22


dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100

dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42

dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:

SAVE4x1

@@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:

dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN

dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:

INIT2x1

@@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40

dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22


dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100

dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42

dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif

dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:


dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END

dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:

INIT1x1

@@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40

dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22


dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100

dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42

dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:

SAVE1x1


dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:


dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 31
- 31
kernel/arm64/gemv_n.S View File

@@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS

cmp N, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
cmp M, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999

lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N

cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
bne .Lgemv_n_kernel_S_BEGIN

gemv_n_kernel_F_LOOP:
.Lgemv_n_kernel_F_LOOP:

ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y
mov Y_OPTR, Y

gemv_n_kernel_F32:
.Lgemv_n_kernel_F32:

asr I, M, #5
cmp I, xzr
beq gemv_n_kernel_F4
beq .Lgemv_n_kernel_F4

gemv_n_kernel_F320:
.Lgemv_n_kernel_F320:

KERNEL_F16
KERNEL_F16

subs I, I, #1
bne gemv_n_kernel_F320
bne .Lgemv_n_kernel_F320

gemv_n_kernel_F4:
.Lgemv_n_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1
beq .Lgemv_n_kernel_F1

gemv_n_kernel_F40:
.Lgemv_n_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_n_kernel_F40
bne .Lgemv_n_kernel_F40

gemv_n_kernel_F1:
.Lgemv_n_kernel_F1:
ands I, M, #3
ble gemv_n_kernel_F_END
ble .Lgemv_n_kernel_F_END

gemv_n_kernel_F10:
.Lgemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_n_kernel_F10
bne .Lgemv_n_kernel_F10

gemv_n_kernel_F_END:
.Lgemv_n_kernel_F_END:

add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_F_LOOP
bne .Lgemv_n_kernel_F_LOOP

b gemv_n_kernel_L999
b .Lgemv_n_kernel_L999

gemv_n_kernel_S_BEGIN:
.Lgemv_n_kernel_S_BEGIN:

INIT_S

gemv_n_kernel_S_LOOP:
.Lgemv_n_kernel_S_LOOP:

ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
@@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:

asr I, M, #2
cmp I, xzr
ble gemv_n_kernel_S1
ble .Lgemv_n_kernel_S1

gemv_n_kernel_S4:
.Lgemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S4
bne .Lgemv_n_kernel_S4

gemv_n_kernel_S1:
.Lgemv_n_kernel_S1:

ands I, M, #3
ble gemv_n_kernel_S_END
ble .Lgemv_n_kernel_S_END

gemv_n_kernel_S10:
.Lgemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S10
bne .Lgemv_n_kernel_S10

gemv_n_kernel_S_END:
.Lgemv_n_kernel_S_END:

add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_S_LOOP
bne .Lgemv_n_kernel_S_LOOP

gemv_n_kernel_L999:
.Lgemv_n_kernel_L999:

mov w0, wzr



+ 31
- 31
kernel/arm64/gemv_t.S View File

@@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS

cmp N, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
cmp M, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999

lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N

cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
bne .Lgemv_t_kernel_S_BEGIN

gemv_t_kernel_F_LOOP:
.Lgemv_t_kernel_F_LOOP:

fmov TEMP, REG0
fmov TEMP1, REG0
@@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X

gemv_t_kernel_F32:
.Lgemv_t_kernel_F32:

asr I, M, #5
cmp I, xzr
beq gemv_t_kernel_F4
beq .Lgemv_t_kernel_F4

gemv_t_kernel_F320:
.Lgemv_t_kernel_F320:

KERNEL_F32

subs I, I, #1
bne gemv_t_kernel_F320
bne .Lgemv_t_kernel_F320

KERNEL_F32_FINALIZE

gemv_t_kernel_F4:
.Lgemv_t_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1
beq .Lgemv_t_kernel_F1

gemv_t_kernel_F40:
.Lgemv_t_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_t_kernel_F40
bne .Lgemv_t_kernel_F40

gemv_t_kernel_F1:
.Lgemv_t_kernel_F1:

KERNEL_F4_FINALIZE

ands I, M, #3
ble gemv_t_kernel_F_END
ble .Lgemv_t_kernel_F_END

gemv_t_kernel_F10:
.Lgemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_t_kernel_F10
bne .Lgemv_t_kernel_F10

gemv_t_kernel_F_END:
.Lgemv_t_kernel_F_END:

ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
bne .Lgemv_t_kernel_F_LOOP

b gemv_t_kernel_L999
b .Lgemv_t_kernel_L999

gemv_t_kernel_S_BEGIN:
.Lgemv_t_kernel_S_BEGIN:

INIT_S

gemv_t_kernel_S_LOOP:
.Lgemv_t_kernel_S_LOOP:

fmov TEMP, REG0
mov A_PTR, A
@@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:

asr I, M, #2
cmp I, xzr
ble gemv_t_kernel_S1
ble .Lgemv_t_kernel_S1

gemv_t_kernel_S4:
.Lgemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S4
bne .Lgemv_t_kernel_S4

gemv_t_kernel_S1:
.Lgemv_t_kernel_S1:

ands I, M, #3
ble gemv_t_kernel_S_END
ble .Lgemv_t_kernel_S_END

gemv_t_kernel_S10:
.Lgemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S10
bne .Lgemv_t_kernel_S10

gemv_t_kernel_S_END:
.Lgemv_t_kernel_S_END:

ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
bne .Lgemv_t_kernel_S_LOOP

gemv_t_kernel_L999:
.Lgemv_t_kernel_L999:

RESTORE_REGS



+ 24
- 24
kernel/arm64/iamax.S View File

@@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero

cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Liamax_kernel_S_BEGIN
mov x7, X

iamax_kernel_F_BEGIN:
.Liamax_kernel_F_BEGIN:

INIT_S

subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999

asr I, N, #3
cmp I, xzr
beq iamax_kernel_F1
beq .Liamax_kernel_F1

add Z, Z, #1
iamax_kernel_F8:
.Liamax_kernel_F8:

KERNEL_F8

subs I, I, #1
bne iamax_kernel_F8
bne .Liamax_kernel_F8

KERNEL_F8_FINALIZE

sub Z, Z, #1
iamax_kernel_F1:
.Liamax_kernel_F1:

ands I, N, #7
ble iamax_kernel_L999
ble .Liamax_kernel_L999

iamax_kernel_F10:
.Liamax_kernel_F10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_F10
bne .Liamax_kernel_F10

b iamax_kernel_L999
b .Liamax_kernel_L999

iamax_kernel_S_BEGIN:
.Liamax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999

asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Liamax_kernel_S1

iamax_kernel_S4:
.Liamax_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4
bne .Liamax_kernel_S4

iamax_kernel_S1:
.Liamax_kernel_S1:

ands I, N, #3
ble iamax_kernel_L999
ble .Liamax_kernel_L999

iamax_kernel_S10:
.Liamax_kernel_S10:

KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
bne .Liamax_kernel_S10

iamax_kernel_L999:
.Liamax_kernel_L999:

mov x0, INDEX
ret

iamax_kernel_zero:
.Liamax_kernel_zero:

mov x0, xzr
ret


+ 24
- 24
kernel/arm64/izamax.S View File

@@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero

cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Lizamax_kernel_S_BEGIN
mov x7, X


iamax_kernel_F_BEGIN:
.Lizamax_kernel_F_BEGIN:

INIT_S

subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999

asr I, N, #3
cmp I, xzr
ble iamax_kernel_F1
ble .Lizamax_kernel_F1

add Z, Z, #1

iamax_kernel_F8:
.Lizamax_kernel_F8:

KERNEL_F8

subs I, I, #1
bne iamax_kernel_F8
bne .Lizamax_kernel_F8

KERNEL_F8_FINALIZE

sub Z, Z, #1
iamax_kernel_F1:
.Lizamax_kernel_F1:

ands I, N, #7
ble iamax_kernel_L999
ble .Lizamax_kernel_L999

iamax_kernel_F10:
.Lizamax_kernel_F10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_F10
bne .Lizamax_kernel_F10

b iamax_kernel_L999
b .Lizamax_kernel_L999

iamax_kernel_S_BEGIN:
.Lizamax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999

asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
ble .Lizamax_kernel_S1

iamax_kernel_S4:
.Lizamax_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4
bne .Lizamax_kernel_S4

iamax_kernel_S1:
.Lizamax_kernel_S1:

ands I, N, #3
ble iamax_kernel_L999
ble .Lizamax_kernel_L999

iamax_kernel_S10:
.Lizamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10
bne .Lizamax_kernel_S10

iamax_kernel_L999:
.Lizamax_kernel_L999:

mov x0, INDEX
ret

iamax_kernel_zero:
.Lizamax_kernel_zero:

mov x0, xzr
ret


+ 16
- 16
kernel/arm64/nrm2.S View File

@@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT

cmp N, #0
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999

cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lnrm2_kernel_L999


cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lnrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:
.Lnrm2_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lnrm2_kernel_F1

nrm2_kernel_F8:
.Lnrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8
bne .Lnrm2_kernel_F8

nrm2_kernel_F1:
.Lnrm2_kernel_F1:

ands I, N, #7
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999


nrm2_kernel_F10:
.Lnrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10
bne .Lnrm2_kernel_F10

b nrm2_kernel_L999
b .Lnrm2_kernel_L999

nrm2_kernel_S_BEGIN:
.Lnrm2_kernel_S_BEGIN:

INIT_S

@@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:

.align 5

nrm2_kernel_S10:
.Lnrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10
bne .Lnrm2_kernel_S10


nrm2_kernel_L999:
.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ



+ 20
- 20
kernel/arm64/rot.S View File

@@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble rot_kernel_L999
ble .Lrot_kernel_L999

INIT

cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN

rot_kernel_F_BEGIN:
.Lrot_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lrot_kernel_F1

KERNEL_INIT_F4

rot_kernel_F4:
.Lrot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4
bne .Lrot_kernel_F4

rot_kernel_F1:
.Lrot_kernel_F1:

ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999

INIT_F1

rot_kernel_F10:
.Lrot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10
bne .Lrot_kernel_F10

mov w0, wzr
ret

rot_kernel_S_BEGIN:
.Lrot_kernel_S_BEGIN:

INIT_S
INIT_F1
@@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:

asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lrot_kernel_S1

rot_kernel_S4:
.Lrot_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4
bne .Lrot_kernel_S4

rot_kernel_S1:
.Lrot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999


rot_kernel_S10:
.Lrot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10
bne .Lrot_kernel_S10

rot_kernel_L999:
.Lrot_kernel_L999:

mov w0, wzr
ret

+ 23
- 23
kernel/arm64/scal.S View File

@@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble scal_kernel_L999
ble .Lscal_kernel_L999

fcmp DA, #0.0
beq scal_kernel_zero
beq .Lscal_kernel_zero

cmp INC_X, #1
bne scal_kernel_S_BEGIN
bne .Lscal_kernel_S_BEGIN

scal_kernel_F_BEGIN:
.Lscal_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq scal_kernel_F1
beq .Lscal_kernel_F1

KERNEL_INIT_F8

scal_kernel_F8:
.Lscal_kernel_F8:

KERNEL_F8

subs I, I, #1
bne scal_kernel_F8
bne .Lscal_kernel_F8

scal_kernel_F1:
.Lscal_kernel_F1:

ands I, N, #7
ble scal_kernel_L999
ble .Lscal_kernel_L999

scal_kernel_F10:
.Lscal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne scal_kernel_F10
bne .Lscal_kernel_F10

mov w0, wzr
ret

scal_kernel_S_BEGIN:
.Lscal_kernel_S_BEGIN:

INIT_S
mov X_COPY, X

asr I, N, #2
cmp I, xzr
ble scal_kernel_S1
ble .Lscal_kernel_S1

scal_kernel_S4:
.Lscal_kernel_S4:

KERNEL_S4

subs I, I, #1
bne scal_kernel_S4
bne .Lscal_kernel_S4

scal_kernel_S1:
.Lscal_kernel_S1:

ands I, N, #3
ble scal_kernel_L999
ble .Lscal_kernel_L999

scal_kernel_S10:
.Lscal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne scal_kernel_S10
bne .Lscal_kernel_S10

scal_kernel_L999:
.Lscal_kernel_L999:

mov w0, wzr
ret

scal_kernel_zero:
.Lscal_kernel_zero:

INIT_S

scal_kernel_Z1:
.Lscal_kernel_Z1:

st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1
bne .Lscal_kernel_Z1

mov w0, wzr
ret


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
File diff suppressed because it is too large
View File


+ 155
- 155
kernel/arm64/sgemm_kernel_4x4.S View File

@@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN
ble .Lsgemm_kernel_L2_BEGIN

/******************************************************************************/

sgemm_kernel_L4_BEGIN:
.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

@@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1
add pA_3, temp, pA_2

sgemm_kernel_L4_M16_BEGIN:
.Lsgemm_kernel_L4_M16_BEGIN:

mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
ble .Lsgemm_kernel_L4_M8_BEGIN

sgemm_kernel_L4_M16_20:
.Lsgemm_kernel_L4_M16_20:

mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32
blt .Lsgemm_kernel_L4_M16_32

KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K

subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
ble .Lsgemm_kernel_L4_M16_22a
.align 5

sgemm_kernel_L4_M16_22:
.Lsgemm_kernel_L4_M16_22:

KERNEL16x4_M1
KERNEL16x4_M2

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
bgt .Lsgemm_kernel_L4_M16_22


sgemm_kernel_L4_M16_22a:
.Lsgemm_kernel_L4_M16_22a:

KERNEL16x4_M1
KERNEL16x4_E

b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44

sgemm_kernel_L4_M16_32:
.Lsgemm_kernel_L4_M16_32:

tst counterL, #1
ble sgemm_kernel_L4_M16_40
ble .Lsgemm_kernel_L4_M16_40

KERNEL16x4_I

KERNEL16x4_E

b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44


sgemm_kernel_L4_M16_40:
.Lsgemm_kernel_L4_M16_40:

INIT16x4

sgemm_kernel_L4_M16_44:
.Lsgemm_kernel_L4_M16_44:

ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100
ble .Lsgemm_kernel_L4_M16_100

sgemm_kernel_L4_M16_46:
.Lsgemm_kernel_L4_M16_46:

KERNEL16x4_SUB

sgemm_kernel_L4_M16_100:
.Lsgemm_kernel_L4_M16_100:

SAVE16x4

sgemm_kernel_L4_M16_END:
.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp
add pA_0, pA_0, temp
@@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp
add pA_3, pA_2, temp
subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20
bne .Lsgemm_kernel_L4_M16_20

sgemm_kernel_L4_M8_BEGIN:
.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END

tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN
ble .Lsgemm_kernel_L4_M4_BEGIN

sgemm_kernel_L4_M8_20:
.Lsgemm_kernel_L4_M8_20:

INIT8x4

mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M8_40
ble .Lsgemm_kernel_L4_M8_40

sgemm_kernel_L4_M8_22:
.Lsgemm_kernel_L4_M8_22:

KERNEL8x4_SUB
KERNEL8x4_SUB
@@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22
bgt .Lsgemm_kernel_L4_M8_22


sgemm_kernel_L4_M8_40:
.Lsgemm_kernel_L4_M8_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100
ble .Lsgemm_kernel_L4_M8_100

sgemm_kernel_L4_M8_42:
.Lsgemm_kernel_L4_M8_42:

KERNEL8x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42
bgt .Lsgemm_kernel_L4_M8_42

sgemm_kernel_L4_M8_100:
.Lsgemm_kernel_L4_M8_100:

SAVE8x4

sgemm_kernel_L4_M8_END:
.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp

sgemm_kernel_L4_M4_BEGIN:
.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END

tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN
ble .Lsgemm_kernel_L4_M2_BEGIN

sgemm_kernel_L4_M4_20:
.Lsgemm_kernel_L4_M4_20:

INIT4x4

mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L4_M4_40
ble .Lsgemm_kernel_L4_M4_40

sgemm_kernel_L4_M4_22:
.Lsgemm_kernel_L4_M4_22:

KERNEL4x4_SUB
KERNEL4x4_SUB
@@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22
bgt .Lsgemm_kernel_L4_M4_22


sgemm_kernel_L4_M4_40:
.Lsgemm_kernel_L4_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100
ble .Lsgemm_kernel_L4_M4_100

sgemm_kernel_L4_M4_42:
.Lsgemm_kernel_L4_M4_42:

KERNEL4x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42
bgt .Lsgemm_kernel_L4_M4_42

sgemm_kernel_L4_M4_100:
.Lsgemm_kernel_L4_M4_100:

SAVE4x4

sgemm_kernel_L4_M4_END:
.Lsgemm_kernel_L4_M4_END:


sgemm_kernel_L4_M2_BEGIN:
.Lsgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L4_M1_BEGIN
ble .Lsgemm_kernel_L4_M1_BEGIN

sgemm_kernel_L4_M2_20:
.Lsgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M2_40
ble .Lsgemm_kernel_L4_M2_40

sgemm_kernel_L4_M2_22:
.Lsgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22
bgt .Lsgemm_kernel_L4_M2_22


sgemm_kernel_L4_M2_40:
.Lsgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100
ble .Lsgemm_kernel_L4_M2_100

sgemm_kernel_L4_M2_42:
.Lsgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42
bgt .Lsgemm_kernel_L4_M2_42

sgemm_kernel_L4_M2_100:
.Lsgemm_kernel_L4_M2_100:

SAVE2x4

sgemm_kernel_L4_M2_END:
.Lsgemm_kernel_L4_M2_END:


sgemm_kernel_L4_M1_BEGIN:
.Lsgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END

sgemm_kernel_L4_M1_20:
.Lsgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L4_M1_40
ble .Lsgemm_kernel_L4_M1_40

sgemm_kernel_L4_M1_22:
.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22
bgt .Lsgemm_kernel_L4_M1_22


sgemm_kernel_L4_M1_40:
.Lsgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100
ble .Lsgemm_kernel_L4_M1_100

sgemm_kernel_L4_M1_42:
.Lsgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42
bgt .Lsgemm_kernel_L4_M1_42

sgemm_kernel_L4_M1_100:
.Lsgemm_kernel_L4_M1_100:

SAVE1x4


sgemm_kernel_L4_END:
.Lsgemm_kernel_L4_END:

lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4

subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN
bgt .Lsgemm_kernel_L4_BEGIN


/******************************************************************************/

sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble sgemm_kernel_L999
ble .Lsgemm_kernel_L999

tst counterJ , #2
ble sgemm_kernel_L1_BEGIN
ble .Lsgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction



sgemm_kernel_L2_M4_BEGIN:
.Lsgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN
ble .Lsgemm_kernel_L2_M2_BEGIN

sgemm_kernel_L2_M4_20:
.Lsgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M4_40
ble .Lsgemm_kernel_L2_M4_40
.align 5

sgemm_kernel_L2_M4_22:
.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22
bgt .Lsgemm_kernel_L2_M4_22


sgemm_kernel_L2_M4_40:
.Lsgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100
ble .Lsgemm_kernel_L2_M4_100

sgemm_kernel_L2_M4_42:
.Lsgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42
bgt .Lsgemm_kernel_L2_M4_42

sgemm_kernel_L2_M4_100:
.Lsgemm_kernel_L2_M4_100:

SAVE4x2

sgemm_kernel_L2_M4_END:
.Lsgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20
bgt .Lsgemm_kernel_L2_M4_20


sgemm_kernel_L2_M2_BEGIN:
.Lsgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L2_M1_BEGIN
ble .Lsgemm_kernel_L2_M1_BEGIN

sgemm_kernel_L2_M2_20:
.Lsgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble sgemm_kernel_L2_M2_40
ble .Lsgemm_kernel_L2_M2_40

sgemm_kernel_L2_M2_22:
.Lsgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22
bgt .Lsgemm_kernel_L2_M2_22


sgemm_kernel_L2_M2_40:
.Lsgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100
ble .Lsgemm_kernel_L2_M2_100

sgemm_kernel_L2_M2_42:
.Lsgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42
bgt .Lsgemm_kernel_L2_M2_42

sgemm_kernel_L2_M2_100:
.Lsgemm_kernel_L2_M2_100:

SAVE2x2

sgemm_kernel_L2_M2_END:
.Lsgemm_kernel_L2_M2_END:


sgemm_kernel_L2_M1_BEGIN:
.Lsgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END

sgemm_kernel_L2_M1_20:
.Lsgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble sgemm_kernel_L2_M1_40
ble .Lsgemm_kernel_L2_M1_40

sgemm_kernel_L2_M1_22:
.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22
bgt .Lsgemm_kernel_L2_M1_22


sgemm_kernel_L2_M1_40:
.Lsgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100
ble .Lsgemm_kernel_L2_M1_100

sgemm_kernel_L2_M1_42:
.Lsgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42
bgt .Lsgemm_kernel_L2_M1_42

sgemm_kernel_L2_M1_100:
.Lsgemm_kernel_L2_M1_100:

SAVE1x2


sgemm_kernel_L2_END:
.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4

/******************************************************************************/

sgemm_kernel_L1_BEGIN:
.Lsgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble sgemm_kernel_L999 // done
ble .Lsgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:



sgemm_kernel_L1_M4_BEGIN:
.Lsgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN
ble .Lsgemm_kernel_L1_M2_BEGIN

sgemm_kernel_L1_M4_20:
.Lsgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M4_40
ble .Lsgemm_kernel_L1_M4_40
.align 5

sgemm_kernel_L1_M4_22:
.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22
bgt .Lsgemm_kernel_L1_M4_22


sgemm_kernel_L1_M4_40:
.Lsgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100
ble .Lsgemm_kernel_L1_M4_100

sgemm_kernel_L1_M4_42:
.Lsgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42
bgt .Lsgemm_kernel_L1_M4_42

sgemm_kernel_L1_M4_100:
.Lsgemm_kernel_L1_M4_100:

SAVE4x1

sgemm_kernel_L1_M4_END:
.Lsgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20
bgt .Lsgemm_kernel_L1_M4_20


sgemm_kernel_L1_M2_BEGIN:
.Lsgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L1_M1_BEGIN
ble .Lsgemm_kernel_L1_M1_BEGIN

sgemm_kernel_L1_M2_20:
.Lsgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M2_40
ble .Lsgemm_kernel_L1_M2_40

sgemm_kernel_L1_M2_22:
.Lsgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22
bgt .Lsgemm_kernel_L1_M2_22


sgemm_kernel_L1_M2_40:
.Lsgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100
ble .Lsgemm_kernel_L1_M2_100

sgemm_kernel_L1_M2_42:
.Lsgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42
bgt .Lsgemm_kernel_L1_M2_42

sgemm_kernel_L1_M2_100:
.Lsgemm_kernel_L1_M2_100:

SAVE2x1

sgemm_kernel_L1_M2_END:
.Lsgemm_kernel_L1_M2_END:


sgemm_kernel_L1_M1_BEGIN:
.Lsgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END

sgemm_kernel_L1_M1_20:
.Lsgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble sgemm_kernel_L1_M1_40
ble .Lsgemm_kernel_L1_M1_40

sgemm_kernel_L1_M1_22:
.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22
bgt .Lsgemm_kernel_L1_M1_22


sgemm_kernel_L1_M1_40:
.Lsgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100
ble .Lsgemm_kernel_L1_M1_100

sgemm_kernel_L1_M1_42:
.Lsgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42
bgt .Lsgemm_kernel_L1_M1_42

sgemm_kernel_L1_M1_100:
.Lsgemm_kernel_L1_M1_100:

SAVE1x1


sgemm_kernel_L1_END:
.Lsgemm_kernel_L1_END:


sgemm_kernel_L999:
.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/sgemm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/strmm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 130
- 130
kernel/arm64/strmm_kernel_4x4.S View File

@@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE

strmm_kernel_begin:
.Lstrmm_kernel_begin:

.align 5
add sp, sp, #-(11 * 16)
@@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble strmm_kernel_L2_BEGIN
ble .Lstrmm_kernel_L2_BEGIN

/******************************************************************************/

strmm_kernel_L4_BEGIN:
.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2

@@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

strmm_kernel_L4_M4_BEGIN:
.Lstrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN
ble .Lstrmm_kernel_L4_M2_BEGIN

strmm_kernel_L4_M4_20:
.Lstrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:

asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32
blt .Lstrmm_kernel_L4_M4_32

KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K

subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a
ble .Lstrmm_kernel_L4_M4_22a
.align 5

strmm_kernel_L4_M4_22:
.Lstrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2

subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22
bgt .Lstrmm_kernel_L4_M4_22

strmm_kernel_L4_M4_22a:
.Lstrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_E

b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44

strmm_kernel_L4_M4_32:
.Lstrmm_kernel_L4_M4_32:

tst counterL, #1
ble strmm_kernel_L4_M4_40
ble .Lstrmm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_E

b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44

strmm_kernel_L4_M4_40:
.Lstrmm_kernel_L4_M4_40:

INIT4x4

strmm_kernel_L4_M4_44:
.Lstrmm_kernel_L4_M4_44:

ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100
ble .Lstrmm_kernel_L4_M4_100

strmm_kernel_L4_M4_46:
.Lstrmm_kernel_L4_M4_46:

KERNEL4x4_SUB

strmm_kernel_L4_M4_100:
.Lstrmm_kernel_L4_M4_100:

SAVE4x4

@@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif

strmm_kernel_L4_M4_END:
.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20
bne .Lstrmm_kernel_L4_M4_20

strmm_kernel_L4_M2_BEGIN:
.Lstrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L4_M1_BEGIN
ble .Lstrmm_kernel_L4_M1_BEGIN

strmm_kernel_L4_M2_20:
.Lstrmm_kernel_L4_M2_20:

INIT2x4

@@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M2_40
ble .Lstrmm_kernel_L4_M2_40

strmm_kernel_L4_M2_22:
.Lstrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22
bgt .Lstrmm_kernel_L4_M2_22


strmm_kernel_L4_M2_40:
.Lstrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100
ble .Lstrmm_kernel_L4_M2_100

strmm_kernel_L4_M2_42:
.Lstrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42
bgt .Lstrmm_kernel_L4_M2_42

strmm_kernel_L4_M2_100:
.Lstrmm_kernel_L4_M2_100:

SAVE2x4

@@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif


strmm_kernel_L4_M2_END:
.Lstrmm_kernel_L4_M2_END:


strmm_kernel_L4_M1_BEGIN:
.Lstrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END

strmm_kernel_L4_M1_20:
.Lstrmm_kernel_L4_M1_20:

INIT1x4

@@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L4_M1_40
ble .Lstrmm_kernel_L4_M1_40

strmm_kernel_L4_M1_22:
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22
bgt .Lstrmm_kernel_L4_M1_22


strmm_kernel_L4_M1_40:
.Lstrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100
ble .Lstrmm_kernel_L4_M1_100

strmm_kernel_L4_M1_42:
.Lstrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42
bgt .Lstrmm_kernel_L4_M1_42

strmm_kernel_L4_M1_100:
.Lstrmm_kernel_L4_M1_100:

SAVE1x4

@@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif


strmm_kernel_L4_END:
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4

#if !defined(LEFT)
@@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN
bgt .Lstrmm_kernel_L4_BEGIN


/******************************************************************************/

strmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble strmm_kernel_L999
ble .Lstrmm_kernel_L999

tst counterJ , #2
ble strmm_kernel_L1_BEGIN
ble .Lstrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov pA, origPA // pA = A

strmm_kernel_L2_M4_BEGIN:
.Lstrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN
ble .Lstrmm_kernel_L2_M2_BEGIN

strmm_kernel_L2_M4_20:
.Lstrmm_kernel_L2_M4_20:

INIT4x2

@@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M4_40
ble .Lstrmm_kernel_L2_M4_40
.align 5

strmm_kernel_L2_M4_22:
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22
bgt .Lstrmm_kernel_L2_M4_22


strmm_kernel_L2_M4_40:
.Lstrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100
ble .Lstrmm_kernel_L2_M4_100

strmm_kernel_L2_M4_42:
.Lstrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42
bgt .Lstrmm_kernel_L2_M4_42

strmm_kernel_L2_M4_100:
.Lstrmm_kernel_L2_M4_100:

SAVE4x2

@@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

strmm_kernel_L2_M4_END:
.Lstrmm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20
bgt .Lstrmm_kernel_L2_M4_20


strmm_kernel_L2_M2_BEGIN:
.Lstrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L2_M1_BEGIN
ble .Lstrmm_kernel_L2_M1_BEGIN

strmm_kernel_L2_M2_20:
.Lstrmm_kernel_L2_M2_20:

INIT2x2

@@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble strmm_kernel_L2_M2_40
ble .Lstrmm_kernel_L2_M2_40

strmm_kernel_L2_M2_22:
.Lstrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22
bgt .Lstrmm_kernel_L2_M2_22


strmm_kernel_L2_M2_40:
.Lstrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100
ble .Lstrmm_kernel_L2_M2_100

strmm_kernel_L2_M2_42:
.Lstrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42
bgt .Lstrmm_kernel_L2_M2_42

strmm_kernel_L2_M2_100:
.Lstrmm_kernel_L2_M2_100:

SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

strmm_kernel_L2_M2_END:
.Lstrmm_kernel_L2_M2_END:


strmm_kernel_L2_M1_BEGIN:
.Lstrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END

strmm_kernel_L2_M1_20:
.Lstrmm_kernel_L2_M1_20:

INIT1x2

@@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble strmm_kernel_L2_M1_40
ble .Lstrmm_kernel_L2_M1_40

strmm_kernel_L2_M1_22:
.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22
bgt .Lstrmm_kernel_L2_M1_22


strmm_kernel_L2_M1_40:
.Lstrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100
ble .Lstrmm_kernel_L2_M1_100

strmm_kernel_L2_M1_42:
.Lstrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42
bgt .Lstrmm_kernel_L2_M1_42

strmm_kernel_L2_M1_100:
.Lstrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1
#endif

strmm_kernel_L2_END:
.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:

/******************************************************************************/

strmm_kernel_L1_BEGIN:
.Lstrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble strmm_kernel_L999 // done
ble .Lstrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:

mov pA, origPA // pA = A

strmm_kernel_L1_M4_BEGIN:
.Lstrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN
ble .Lstrmm_kernel_L1_M2_BEGIN

strmm_kernel_L1_M4_20:
.Lstrmm_kernel_L1_M4_20:

INIT4x1

@@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M4_40
ble .Lstrmm_kernel_L1_M4_40
.align 5

strmm_kernel_L1_M4_22:
.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22
bgt .Lstrmm_kernel_L1_M4_22


strmm_kernel_L1_M4_40:
.Lstrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100
ble .Lstrmm_kernel_L1_M4_100

strmm_kernel_L1_M4_42:
.Lstrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42
bgt .Lstrmm_kernel_L1_M4_42

strmm_kernel_L1_M4_100:
.Lstrmm_kernel_L1_M4_100:

SAVE4x1

@@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

strmm_kernel_L1_M4_END:
.Lstrmm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20
bgt .Lstrmm_kernel_L1_M4_20


strmm_kernel_L1_M2_BEGIN:
.Lstrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L1_M1_BEGIN
ble .Lstrmm_kernel_L1_M1_BEGIN

strmm_kernel_L1_M2_20:
.Lstrmm_kernel_L1_M2_20:

INIT2x1

@@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M2_40
ble .Lstrmm_kernel_L1_M2_40

strmm_kernel_L1_M2_22:
.Lstrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22
bgt .Lstrmm_kernel_L1_M2_22


strmm_kernel_L1_M2_40:
.Lstrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100
ble .Lstrmm_kernel_L1_M2_100

strmm_kernel_L1_M2_42:
.Lstrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42
bgt .Lstrmm_kernel_L1_M2_42

strmm_kernel_L1_M2_100:
.Lstrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif


strmm_kernel_L1_M2_END:
.Lstrmm_kernel_L1_M2_END:


strmm_kernel_L1_M1_BEGIN:
.Lstrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END

strmm_kernel_L1_M1_20:
.Lstrmm_kernel_L1_M1_20:

INIT1x1

@@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble strmm_kernel_L1_M1_40
ble .Lstrmm_kernel_L1_M1_40

strmm_kernel_L1_M1_22:
.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22
bgt .Lstrmm_kernel_L1_M1_22


strmm_kernel_L1_M1_40:
.Lstrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100
ble .Lstrmm_kernel_L1_M1_100

strmm_kernel_L1_M1_42:
.Lstrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42
bgt .Lstrmm_kernel_L1_M1_42

strmm_kernel_L1_M1_100:
.Lstrmm_kernel_L1_M1_100:

SAVE1x1

@@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif
#endif

strmm_kernel_L1_END:
.Lstrmm_kernel_L1_END:

#if 0
#if !defined(LEFT)
@@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif
#endif

strmm_kernel_L999:
.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/strmm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 21
- 21
kernel/arm64/swap.S View File

@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble swap_kernel_L999
ble .Lswap_kernel_L999

cmp INC_X, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN

swap_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq swap_kernel_F1
beq .Lswap_kernel_F1

swap_kernel_F8:
.Lswap_kernel_F8:

KERNEL_F8

subs I, I, #1
bne swap_kernel_F8
bne .Lswap_kernel_F8

swap_kernel_F1:
.Lswap_kernel_F1:

ands I, N, #7
ble swap_kernel_L999
ble .Lswap_kernel_L999

swap_kernel_F10:
.Lswap_kernel_F10:

KERNEL_F1

subs I, I, #1
bne swap_kernel_F10
bne .Lswap_kernel_F10

b swap_kernel_L999
b .Lswap_kernel_L999


swap_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble swap_kernel_S1
ble .Lswap_kernel_S1

swap_kernel_S4:
.Lswap_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1

subs I, I, #1
bne swap_kernel_S4
bne .Lswap_kernel_S4

swap_kernel_S1:
.Lswap_kernel_S1:

ands I, N, #3
ble swap_kernel_L999
ble .Lswap_kernel_L999

swap_kernel_S10:
.Lswap_kernel_S10:

KERNEL_S1

subs I, I, #1
bne swap_kernel_S10
bne .Lswap_kernel_S10

swap_kernel_L999:
.Lswap_kernel_L999:

mov w0, wzr
ret


+ 25
- 25
kernel/arm64/zamax.S View File

@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lzamax_kernel_S_BEGIN

amax_kernel_F_BEGIN:
.Lzamax_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lzamax_kernel_F1_INIT

INIT_F4
subs I, I, #1
beq amax_kernel_F1
beq .Lzamax_kernel_F1

amax_kernel_F4:
.Lzamax_kernel_F4:

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4
bne .Lzamax_kernel_F4

amax_kernel_F1:
.Lzamax_kernel_F1:

ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999

amax_kernel_F10:
.Lzamax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10
bne .Lzamax_kernel_F10

ret

amax_kernel_F1_INIT:
.Lzamax_kernel_F1_INIT:

INIT_F1
subs N, N, #1
b amax_kernel_F1
b .Lzamax_kernel_F1

amax_kernel_S_BEGIN:
.Lzamax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble amax_kernel_L999
ble .Lzamax_kernel_L999

asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
ble .Lzamax_kernel_S1

amax_kernel_S4:
.Lzamax_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4
bne .Lzamax_kernel_S4

amax_kernel_S1:
.Lzamax_kernel_S1:

ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999

amax_kernel_S10:
.Lzamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10
bne .Lzamax_kernel_S10

amax_kernel_L999:
.Lzamax_kernel_L999:

ret

amax_kernel_zero:
.Lzamax_kernel_zero:

fmov MAXF, REG0
ret


+ 20
- 20
kernel/arm64/zasum.S View File

@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0

cmp N, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lzasum_kernel_S_BEGIN

asum_kernel_F_BEGIN:
.Lzasum_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq asum_kernel_F1
beq .Lzasum_kernel_F1

asum_kernel_F4:
.Lzasum_kernel_F4:

KERNEL_F4

subs I, I, #1
bne asum_kernel_F4
bne .Lzasum_kernel_F4

KERNEL_F4_FINALIZE

asum_kernel_F1:
.Lzasum_kernel_F1:

ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999

asum_kernel_F10:
.Lzasum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10
bne .Lzasum_kernel_F10

asum_kernel_L999:
.Lzasum_kernel_L999:
ret

asum_kernel_S_BEGIN:
.Lzasum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
ble .Lzasum_kernel_S1

asum_kernel_S4:
.Lzasum_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4
bne .Lzasum_kernel_S4

asum_kernel_S1:
.Lzasum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999

asum_kernel_S10:
.Lzasum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10
bne .Lzasum_kernel_S10

ret



+ 21
- 21
kernel/arm64/zaxpy.S View File

@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999

mov Y_COPY, Y

fcmp DA_R, #0.0
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999
beq .Lzaxpy_kernel_L999

.L1:
INIT

cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN

zaxpy_kernel_F_BEGIN:
.Lzaxpy_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq zaxpy_kernel_F1
beq .Lzaxpy_kernel_F1

KERNEL_INIT_F4

zaxpy_kernel_F4:
.Lzaxpy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zaxpy_kernel_F4
bne .Lzaxpy_kernel_F4

zaxpy_kernel_F1:
.Lzaxpy_kernel_F1:

ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999

zaxpy_kernel_F10:
.Lzaxpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zaxpy_kernel_F10
bne .Lzaxpy_kernel_F10

mov w0, wzr
ret

zaxpy_kernel_S_BEGIN:
.Lzaxpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1
ble .Lzaxpy_kernel_S1

zaxpy_kernel_S4:
.Lzaxpy_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S4
bne .Lzaxpy_kernel_S4

zaxpy_kernel_S1:
.Lzaxpy_kernel_S1:

ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999

zaxpy_kernel_S10:
.Lzaxpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S10
bne .Lzaxpy_kernel_S10

zaxpy_kernel_L999:
.Lzaxpy_kernel_L999:

mov w0, wzr
ret

+ 20
- 20
kernel/arm64/zdot.S View File

@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

cmp N, xzr
ble dot_kernel_L999
ble .Lzdot_kernel_L999

cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN

dot_kernel_F_BEGIN:
.Lzdot_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
beq .Lzdot_kernel_F1

dot_kernel_F4:
.Lzdot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4
bne .Lzdot_kernel_F4

KERNEL_F4_FINALIZE

dot_kernel_F1:
.Lzdot_kernel_F1:

ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999

dot_kernel_F10:
.Lzdot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10
bne .Lzdot_kernel_F10

ret

dot_kernel_S_BEGIN:
.Lzdot_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
ble .Lzdot_kernel_S1

dot_kernel_S4:
.Lzdot_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4
bne .Lzdot_kernel_S4

dot_kernel_S1:
.Lzdot_kernel_S1:

ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999

dot_kernel_S10:
.Lzdot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10
bne .Lzdot_kernel_S10

dot_kernel_L999:
.Lzdot_kernel_L999:

ret



+ 130
- 130
kernel/arm64/zgemm_kernel_4x4.S View File

@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN

zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN

.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:

mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32

KERNEL4x4_I
KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2

subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a

.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22

.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E

b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44

.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:

tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E

b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:

INIT4x4

zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:

ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100

.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB

subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46

zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

SAVE4x4

zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20

zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN

zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40

zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22


zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100

zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42

zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:

SAVE2x4

zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:


zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END

zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40

zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22


zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100

zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42

zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:

SAVE1x4


zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:

lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2

subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN


/******************************************************************************/

zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999

tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction



zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN

zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5

zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22


zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100

zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42

zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:

SAVE4x2

zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20


zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN

zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40

zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22


zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100

zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42

zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:

SAVE2x2

zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:


zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END

zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40

zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22


zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100

zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42

zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:

SAVE1x2


zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2

/******************************************************************************/

zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:



zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN

zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5

zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22


zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100

zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42

zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:

SAVE4x1

zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20


zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN

zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40

zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22


zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100

zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42

zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:

SAVE2x1

zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:


zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END

zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40

zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22


zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100

zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42

zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:

SAVE1x1


zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:


zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 130
- 130
kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S View File

@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN

zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:

mov pA, origPA // pA = start of A array

zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN

.align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:

mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32

KERNEL4x4_I
KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2

subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a

.align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22

.align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E

b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44

.align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:

tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E

b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:

INIT4x4

zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:

ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100

.align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB

subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46

zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

SAVE4x4

zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20

zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN

zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:

INIT2x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40

zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22


zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100

zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42

zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:

SAVE2x4

zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:


zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END

zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:

INIT1x4

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40

zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22


zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100

zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42

zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:

SAVE1x4


zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:

lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2

subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN


/******************************************************************************/

zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999

tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction



zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN

zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:

INIT4x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5

zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22


zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100

zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42

zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:

SAVE4x2

zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20


zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN

zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:

INIT2x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40

zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22


zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100

zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42

zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:

SAVE2x2

zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:


zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END

zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:

INIT1x2

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40

zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22


zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100

zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42

zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:

SAVE1x2


zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2

/******************************************************************************/

zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:



zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN

zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:

INIT4x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5

zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22


zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100

zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42

zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:

SAVE4x1

zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20


zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN

zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:

INIT2x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40

zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22


zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100

zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42

zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:

SAVE2x1

zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:


zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END

zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:

INIT1x1

mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40

zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22


zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:

ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100

zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42

zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:

SAVE1x1


zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:


zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


+ 26
- 26
kernel/arm64/zgemv_n.S View File

@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS

cmp N, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999

lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT

cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
bne .Lzgemv_n_kernel_S_BEGIN

zgemv_n_kernel_F_LOOP:
.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:

asr I, M, #2
cmp I, xzr
beq zgemv_n_kernel_F1
beq .Lzgemv_n_kernel_F1

zgemv_n_kernel_F4:
.Lzgemv_n_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zgemv_n_kernel_F4
bne .Lzgemv_n_kernel_F4

zgemv_n_kernel_F1:
.Lzgemv_n_kernel_F1:

ands I, M, #3
ble zgemv_n_kernel_F_END
ble .Lzgemv_n_kernel_F_END

zgemv_n_kernel_F10:
.Lzgemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_n_kernel_F10
bne .Lzgemv_n_kernel_F10

zgemv_n_kernel_F_END:
.Lzgemv_n_kernel_F_END:

add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_F_LOOP
bne .Lzgemv_n_kernel_F_LOOP

b zgemv_n_kernel_L999
b .Lzgemv_n_kernel_L999

zgemv_n_kernel_S_BEGIN:
.Lzgemv_n_kernel_S_BEGIN:

INIT_S

zgemv_n_kernel_S_LOOP:
.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
@@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:

asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1
ble .Lzgemv_n_kernel_S1

zgemv_n_kernel_S4:
.Lzgemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S4
bne .Lzgemv_n_kernel_S4

zgemv_n_kernel_S1:
.Lzgemv_n_kernel_S1:

ands I, M, #3
ble zgemv_n_kernel_S_END
ble .Lzgemv_n_kernel_S_END

zgemv_n_kernel_S10:
.Lzgemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S10
bne .Lzgemv_n_kernel_S10

zgemv_n_kernel_S_END:
.Lzgemv_n_kernel_S_END:

add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP
bne .Lzgemv_n_kernel_S_LOOP

zgemv_n_kernel_L999:
.Lzgemv_n_kernel_L999:
RESTORE_REGS

mov w0, wzr


+ 26
- 26
kernel/arm64/zgemv_t.S View File

@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS

cmp N, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999

lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
@@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT

cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
bne .Lzgemv_t_kernel_S_BEGIN

zgemv_t_kernel_F_LOOP:
.Lzgemv_t_kernel_F_LOOP:

mov A_PTR, A
mov X_PTR, X
@@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:

asr I, M, #2
cmp I, xzr
beq zgemv_t_kernel_F1
beq .Lzgemv_t_kernel_F1

zgemv_t_kernel_F4:
.Lzgemv_t_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zgemv_t_kernel_F4
bne .Lzgemv_t_kernel_F4

KERNEL_F4_FINALIZE

zgemv_t_kernel_F1:
.Lzgemv_t_kernel_F1:

ands I, M, #3
ble zgemv_t_kernel_F_END
ble .Lzgemv_t_kernel_F_END

zgemv_t_kernel_F10:
.Lzgemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_t_kernel_F10
bne .Lzgemv_t_kernel_F10

zgemv_t_kernel_F_END:
.Lzgemv_t_kernel_F_END:

#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:

add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_F_LOOP
bne .Lzgemv_t_kernel_F_LOOP

b zgemv_t_kernel_L999
b .Lzgemv_t_kernel_L999

zgemv_t_kernel_S_BEGIN:
.Lzgemv_t_kernel_S_BEGIN:

INIT_S

zgemv_t_kernel_S_LOOP:
.Lzgemv_t_kernel_S_LOOP:

mov A_PTR, A
mov X_PTR, X
@@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:

asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1
ble .Lzgemv_t_kernel_S1

zgemv_t_kernel_S4:
.Lzgemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S4
bne .Lzgemv_t_kernel_S4

zgemv_t_kernel_S1:
.Lzgemv_t_kernel_S1:

ands I, M, #3
ble zgemv_t_kernel_S_END
ble .Lzgemv_t_kernel_S_END

zgemv_t_kernel_S10:
.Lzgemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S10
bne .Lzgemv_t_kernel_S10

zgemv_t_kernel_S_END:
.Lzgemv_t_kernel_S_END:

#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
@@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:

add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP
bne .Lzgemv_t_kernel_S_LOOP

zgemv_t_kernel_L999:
.Lzgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret


+ 16
- 16
kernel/arm64/znrm2.S View File

@@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
INIT

cmp N, #0
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999

cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lznrm2_kernel_L999

cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lznrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:
.Lznrm2_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8
cmp I, xzr
ble nrm2_kernel_F1
ble .Lznrm2_kernel_F1

nrm2_kernel_F8:
.Lznrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8
bne .Lznrm2_kernel_F8

nrm2_kernel_F1:
.Lznrm2_kernel_F1:

ands I, N, #7
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999


nrm2_kernel_F10:
.Lznrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10
bne .Lznrm2_kernel_F10

b nrm2_kernel_L999
b .Lznrm2_kernel_L999

nrm2_kernel_S_BEGIN:
.Lznrm2_kernel_S_BEGIN:

INIT_S

@@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:

.align 5

nrm2_kernel_S10:
.Lznrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10
bne .Lznrm2_kernel_S10


nrm2_kernel_L999:
.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ



+ 20
- 20
kernel/arm64/zrot.S View File

@@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

cmp N, xzr
ble rot_kernel_L999
ble .Lzrot_kernel_L999

INIT

cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN

rot_kernel_F_BEGIN:
.Lzrot_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
beq .Lzrot_kernel_F1

KERNEL_INIT_F4

rot_kernel_F4:
.Lzrot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4
bne .Lzrot_kernel_F4

rot_kernel_F1:
.Lzrot_kernel_F1:

ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999

rot_kernel_F10:
.Lzrot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10
bne .Lzrot_kernel_F10

mov w0, wzr
ret

rot_kernel_S_BEGIN:
.Lzrot_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
ble .Lzrot_kernel_S1

rot_kernel_S4:
.Lzrot_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4
bne .Lzrot_kernel_S4

rot_kernel_S1:
.Lzrot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999

rot_kernel_S10:
.Lzrot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10
bne .Lzrot_kernel_S10

rot_kernel_L999:
.Lzrot_kernel_L999:

mov w0, wzr
ret

+ 34
- 34
kernel/arm64/zscal.S View File

@@ -215,71 +215,71 @@ zscal_begin:
mov X_COPY, X

cmp N, xzr
ble zscal_kernel_L999
ble .Lzscal_kernel_L999

fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero
bne .Lzscal_kernel_R_non_zero

fcmp DA_I, #0.0
beq zscal_kernel_RI_zero
beq .Lzscal_kernel_RI_zero

b zscal_kernel_R_zero
b .Lzscal_kernel_R_zero

zscal_kernel_R_non_zero:
.Lzscal_kernel_R_non_zero:

fcmp DA_I, #0.0
beq zscal_kernel_I_zero
beq .Lzscal_kernel_I_zero

/*******************************************************************************
* A_R != 0 && A_I != 0
*******************************************************************************/

zscal_kernel_RI_non_zero:
.Lzscal_kernel_RI_non_zero:

INIT

cmp INC_X, #1
bne zscal_kernel_S_BEGIN
bne .Lzscal_kernel_S_BEGIN

zscal_kernel_F_BEGIN:
.Lzscal_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1
beq .Lzscal_kernel_F1

KERNEL_INIT_F4

zscal_kernel_F4:
.Lzscal_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zscal_kernel_F4
bne .Lzscal_kernel_F4

zscal_kernel_F1:
.Lzscal_kernel_F1:

ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999

zscal_kernel_F10:
.Lzscal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zscal_kernel_F10
bne .Lzscal_kernel_F10

mov w0, wzr
ret

zscal_kernel_S_BEGIN:
.Lzscal_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1
ble .Lzscal_kernel_S1

zscal_kernel_S4:
.Lzscal_kernel_S4:

KERNEL_S1
KERNEL_S1
@@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1

subs I, I, #1
bne zscal_kernel_S4
bne .Lzscal_kernel_S4

zscal_kernel_S1:
.Lzscal_kernel_S1:

ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999

zscal_kernel_S10:
.Lzscal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zscal_kernel_S10
bne .Lzscal_kernel_S10

zscal_kernel_L999:
.Lzscal_kernel_L999:

mov w0, wzr
ret
@@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0
*******************************************************************************/

zscal_kernel_R_zero:
.Lzscal_kernel_R_zero:
INIT_S

#if !defined(DOUBLE)
@@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif

zscal_kernel_R_zero_1:
.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_R_zero_1
bne .Lzscal_kernel_R_zero_1

mov w0, wzr
ret
@@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0
*******************************************************************************/

zscal_kernel_I_zero:
.Lzscal_kernel_I_zero:
INIT_S
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif

zscal_kernel_I_zero_1:
.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_I_zero_1
bne .Lzscal_kernel_I_zero_1

mov w0, wzr
ret
@@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0
*******************************************************************************/

zscal_kernel_RI_zero:
.Lzscal_kernel_RI_zero:

INIT_S

zscal_kernel_RI_zero_1:
.Lzscal_kernel_RI_zero_1:

stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_RI_zero_1
bne .Lzscal_kernel_RI_zero_1

mov w0, wzr
ret


+ 130
- 130
kernel/arm64/ztrmm_kernel_4x4.S View File

@@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN
ble .Lztrmm_kernel_L2_BEGIN

ztrmm_kernel_L4_BEGIN:
.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
@@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif
mov pA, origPA // pA = start of A array

ztrmm_kernel_L4_M4_BEGIN:
.Lztrmm_kernel_L4_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
ble .Lztrmm_kernel_L4_M2_BEGIN

.align 5
ztrmm_kernel_L4_M4_20:
.Lztrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
@@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:

asr counterL , tempK, #3
cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
blt .Lztrmm_kernel_L4_M4_32

KERNEL4x4_I
KERNEL4x4_M2
@@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2

subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
ble .Lztrmm_kernel_L4_M4_22a

.align 5
ztrmm_kernel_L4_M4_22:
.Lztrmm_kernel_L4_M4_22:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
bgt .Lztrmm_kernel_L4_M4_22

.align 5
ztrmm_kernel_L4_M4_22a:
.Lztrmm_kernel_L4_M4_22a:

KERNEL4x4_M1
KERNEL4x4_M2
@@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E

b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44

.align 5
ztrmm_kernel_L4_M4_32:
.Lztrmm_kernel_L4_M4_32:

tst counterL, #1
ble ztrmm_kernel_L4_M4_40
ble .Lztrmm_kernel_L4_M4_40

KERNEL4x4_I
KERNEL4x4_M2
@@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1
KERNEL4x4_E

b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44


ztrmm_kernel_L4_M4_40:
.Lztrmm_kernel_L4_M4_40:

INIT4x4

ztrmm_kernel_L4_M4_44:
.Lztrmm_kernel_L4_M4_44:

ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
ble .Lztrmm_kernel_L4_M4_100

.align 5
ztrmm_kernel_L4_M4_46:
.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB

subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
bne .Lztrmm_kernel_L4_M4_46

ztrmm_kernel_L4_M4_100:
.Lztrmm_kernel_L4_M4_100:

SAVE4x4

@@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]

ztrmm_kernel_L4_M4_END:
.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
bne .Lztrmm_kernel_L4_M4_20

ztrmm_kernel_L4_M2_BEGIN:
.Lztrmm_kernel_L4_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END

tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L4_M1_BEGIN
ble .Lztrmm_kernel_L4_M1_BEGIN

ztrmm_kernel_L4_M2_20:
.Lztrmm_kernel_L4_M2_20:

INIT2x4

@@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L4_M2_40
ble .Lztrmm_kernel_L4_M2_40

ztrmm_kernel_L4_M2_22:
.Lztrmm_kernel_L4_M2_22:

KERNEL2x4_SUB
KERNEL2x4_SUB
@@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22
bgt .Lztrmm_kernel_L4_M2_22


ztrmm_kernel_L4_M2_40:
.Lztrmm_kernel_L4_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100
ble .Lztrmm_kernel_L4_M2_100

ztrmm_kernel_L4_M2_42:
.Lztrmm_kernel_L4_M2_42:

KERNEL2x4_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42
bgt .Lztrmm_kernel_L4_M2_42

ztrmm_kernel_L4_M2_100:
.Lztrmm_kernel_L4_M2_100:

SAVE2x4

@@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L4_M2_END:
.Lztrmm_kernel_L4_M2_END:


ztrmm_kernel_L4_M1_BEGIN:
.Lztrmm_kernel_L4_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END

ztrmm_kernel_L4_M1_20:
.Lztrmm_kernel_L4_M1_20:

INIT1x4

@@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L4_M1_40
ble .Lztrmm_kernel_L4_M1_40

ztrmm_kernel_L4_M1_22:
.Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
@@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22
bgt .Lztrmm_kernel_L4_M1_22


ztrmm_kernel_L4_M1_40:
.Lztrmm_kernel_L4_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M1_100
ble .Lztrmm_kernel_L4_M1_100

ztrmm_kernel_L4_M1_42:
.Lztrmm_kernel_L4_M1_42:

KERNEL1x4_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42
bgt .Lztrmm_kernel_L4_M1_42

ztrmm_kernel_L4_M1_100:
.Lztrmm_kernel_L4_M1_100:

SAVE1x4

@@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif


ztrmm_kernel_L4_END:
.Lztrmm_kernel_L4_END:

lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif

subs counterJ, counterJ , #1 // j--
bgt ztrmm_kernel_L4_BEGIN
bgt .Lztrmm_kernel_L4_BEGIN


/******************************************************************************/

ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov counterJ , origN
tst counterJ , #3
ble ztrmm_kernel_L999 // error, N was less than 4?
ble .Lztrmm_kernel_L999 // error, N was less than 4?

tst counterJ , #2
ble ztrmm_kernel_L1_BEGIN
ble .Lztrmm_kernel_L1_BEGIN

mov pCRow0, pC // pCRow0 = pC

@@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction

mov pA, origPA // pA = A

ztrmm_kernel_L2_M4_BEGIN:
.Lztrmm_kernel_L2_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
ble ztrmm_kernel_L2_M2_BEGIN
ble .Lztrmm_kernel_L2_M2_BEGIN

ztrmm_kernel_L2_M4_20:
.Lztrmm_kernel_L2_M4_20:

INIT4x2

@@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ztrmm_kernel_L2_M4_40
ble .Lztrmm_kernel_L2_M4_40
.align 5

ztrmm_kernel_L2_M4_22:
.Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
@@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22
bgt .Lztrmm_kernel_L2_M4_22


ztrmm_kernel_L2_M4_40:
.Lztrmm_kernel_L2_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M4_100
ble .Lztrmm_kernel_L2_M4_100

ztrmm_kernel_L2_M4_42:
.Lztrmm_kernel_L2_M4_42:

KERNEL4x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42
bgt .Lztrmm_kernel_L2_M4_42

ztrmm_kernel_L2_M4_100:
.Lztrmm_kernel_L2_M4_100:

SAVE4x2

@@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4
#endif

ztrmm_kernel_L2_M4_END:
.Lztrmm_kernel_L2_M4_END:

subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20
bgt .Lztrmm_kernel_L2_M4_20


ztrmm_kernel_L2_M2_BEGIN:
.Lztrmm_kernel_L2_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END

tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L2_M1_BEGIN
ble .Lztrmm_kernel_L2_M1_BEGIN

ztrmm_kernel_L2_M2_20:
.Lztrmm_kernel_L2_M2_20:

INIT2x2

@@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
ble ztrmm_kernel_L2_M2_40
ble .Lztrmm_kernel_L2_M2_40

ztrmm_kernel_L2_M2_22:
.Lztrmm_kernel_L2_M2_22:

KERNEL2x2_SUB
KERNEL2x2_SUB
@@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22
bgt .Lztrmm_kernel_L2_M2_22


ztrmm_kernel_L2_M2_40:
.Lztrmm_kernel_L2_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M2_100
ble .Lztrmm_kernel_L2_M2_100

ztrmm_kernel_L2_M2_42:
.Lztrmm_kernel_L2_M2_42:

KERNEL2x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42
bgt .Lztrmm_kernel_L2_M2_42

ztrmm_kernel_L2_M2_100:
.Lztrmm_kernel_L2_M2_100:

SAVE2x2

@@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L2_M2_END:
.Lztrmm_kernel_L2_M2_END:


ztrmm_kernel_L2_M1_BEGIN:
.Lztrmm_kernel_L2_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END

ztrmm_kernel_L2_M1_20:
.Lztrmm_kernel_L2_M1_20:

INIT1x2

@@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble ztrmm_kernel_L2_M1_40
ble .Lztrmm_kernel_L2_M1_40

ztrmm_kernel_L2_M1_22:
.Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
@@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22
bgt .Lztrmm_kernel_L2_M1_22


ztrmm_kernel_L2_M1_40:
.Lztrmm_kernel_L2_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M1_100
ble .Lztrmm_kernel_L2_M1_100

ztrmm_kernel_L2_M1_42:
.Lztrmm_kernel_L2_M1_42:

KERNEL1x2_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42
bgt .Lztrmm_kernel_L2_M1_42

ztrmm_kernel_L2_M1_100:
.Lztrmm_kernel_L2_M1_100:

SAVE1x2

@@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif


ztrmm_kernel_L2_END:
.Lztrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
@@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:

/******************************************************************************/

ztrmm_kernel_L1_BEGIN:
.Lztrmm_kernel_L1_BEGIN:

mov counterJ , origN
tst counterJ , #1
ble ztrmm_kernel_L999 // done
ble .Lztrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C
@@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:



ztrmm_kernel_L1_M4_BEGIN:
.Lztrmm_kernel_L1_M4_BEGIN:

mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
ble ztrmm_kernel_L1_M2_BEGIN
ble .Lztrmm_kernel_L1_M2_BEGIN

ztrmm_kernel_L1_M4_20:
.Lztrmm_kernel_L1_M4_20:

INIT4x1

@@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M4_40
ble .Lztrmm_kernel_L1_M4_40
.align 5

ztrmm_kernel_L1_M4_22:
.Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22
bgt .Lztrmm_kernel_L1_M4_22


ztrmm_kernel_L1_M4_40:
.Lztrmm_kernel_L1_M4_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M4_100
ble .Lztrmm_kernel_L1_M4_100

ztrmm_kernel_L1_M4_42:
.Lztrmm_kernel_L1_M4_42:

KERNEL4x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42
bgt .Lztrmm_kernel_L1_M4_42

ztrmm_kernel_L1_M4_100:
.Lztrmm_kernel_L1_M4_100:

SAVE4x1

@@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4
#endif

ztrmm_kernel_L1_M4_END:
.Lztrmm_kernel_L1_M4_END:

subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20
bgt .Lztrmm_kernel_L1_M4_20


ztrmm_kernel_L1_M2_BEGIN:
.Lztrmm_kernel_L1_M2_BEGIN:

mov counterI, origM
tst counterI , #3
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END

tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L1_M1_BEGIN
ble .Lztrmm_kernel_L1_M1_BEGIN

ztrmm_kernel_L1_M2_20:
.Lztrmm_kernel_L1_M2_20:

INIT2x1

@@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M2_40
ble .Lztrmm_kernel_L1_M2_40

ztrmm_kernel_L1_M2_22:
.Lztrmm_kernel_L1_M2_22:

KERNEL2x1_SUB
KERNEL2x1_SUB
@@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22
bgt .Lztrmm_kernel_L1_M2_22


ztrmm_kernel_L1_M2_40:
.Lztrmm_kernel_L1_M2_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M2_100
ble .Lztrmm_kernel_L1_M2_100

ztrmm_kernel_L1_M2_42:
.Lztrmm_kernel_L1_M2_42:

KERNEL2x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42
bgt .Lztrmm_kernel_L1_M2_42

ztrmm_kernel_L1_M2_100:
.Lztrmm_kernel_L1_M2_100:

SAVE2x1

@@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L1_M2_END:
.Lztrmm_kernel_L1_M2_END:


ztrmm_kernel_L1_M1_BEGIN:
.Lztrmm_kernel_L1_M1_BEGIN:

tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END

ztrmm_kernel_L1_M1_20:
.Lztrmm_kernel_L1_M1_20:

INIT1x1

@@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:

asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
ble ztrmm_kernel_L1_M1_40
ble .Lztrmm_kernel_L1_M1_40

ztrmm_kernel_L1_M1_22:
.Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
@@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22
bgt .Lztrmm_kernel_L1_M1_22


ztrmm_kernel_L1_M1_40:
.Lztrmm_kernel_L1_M1_40:

ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M1_100
ble .Lztrmm_kernel_L1_M1_100

ztrmm_kernel_L1_M1_42:
.Lztrmm_kernel_L1_M1_42:

KERNEL1x1_SUB

subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42
bgt .Lztrmm_kernel_L1_M1_42

ztrmm_kernel_L1_M1_100:
.Lztrmm_kernel_L1_M1_100:

SAVE1x1


ztrmm_kernel_L1_END:
.Lztrmm_kernel_L1_END:


ztrmm_kernel_L999:
.Lztrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]


Loading…
Cancel
Save