Browse Source

ARM64: Convert all labels to local labels

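When debugging or profiling applications with perf or other tools, each
kernel appears scattered across many entries in the profile reports. This is
because the labels within the kernels are not local: every label is emitted
as a symbol, so the tools show each one as a separate function.

To avoid this, all the labels within the kernels are changed to
assembler-local labels by adding the `.L` prefix (for example,
`amax_kernel_F4` becomes `.Lamax_kernel_F4`).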
tags/v0.3.0
Ashwin Sekhar T K 8 years ago
parent
commit
a0128aa489
50 changed files with 4469 additions and 4469 deletions
  1. +25
    -25
      kernel/arm64/amax.S
  2. +20
    -20
      kernel/arm64/asum.S
  3. +21
    -21
      kernel/arm64/axpy.S
  4. +20
    -20
      kernel/arm64/casum.S
  5. +142
    -142
      kernel/arm64/cgemm_kernel_4x4.S
  6. +175
    -175
      kernel/arm64/cgemm_kernel_8x4.S
  7. +175
    -175
      kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
  8. +20
    -20
      kernel/arm64/copy.S
  9. +129
    -129
      kernel/arm64/ctrmm_kernel_4x4.S
  10. +175
    -175
      kernel/arm64/ctrmm_kernel_8x4.S
  11. +22
    -22
      kernel/arm64/daxpy_thunderx2t99.S
  12. +143
    -143
      kernel/arm64/dgemm_kernel_4x4.S
  13. +176
    -176
      kernel/arm64/dgemm_kernel_4x8.S
  14. +169
    -169
      kernel/arm64/dgemm_kernel_8x4.S
  15. +169
    -169
      kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S
  16. +36
    -36
      kernel/arm64/dgemm_ncopy_4.S
  17. +48
    -48
      kernel/arm64/dgemm_ncopy_8.S
  18. +36
    -36
      kernel/arm64/dgemm_tcopy_4.S
  19. +56
    -56
      kernel/arm64/dgemm_tcopy_8.S
  20. +20
    -20
      kernel/arm64/dot.S
  21. +129
    -129
      kernel/arm64/dtrmm_kernel_4x4.S
  22. +176
    -176
      kernel/arm64/dtrmm_kernel_4x8.S
  23. +169
    -169
      kernel/arm64/dtrmm_kernel_8x4.S
  24. +31
    -31
      kernel/arm64/gemv_n.S
  25. +31
    -31
      kernel/arm64/gemv_t.S
  26. +24
    -24
      kernel/arm64/iamax.S
  27. +24
    -24
      kernel/arm64/izamax.S
  28. +16
    -16
      kernel/arm64/nrm2.S
  29. +20
    -20
      kernel/arm64/rot.S
  30. +23
    -23
      kernel/arm64/scal.S
  31. +221
    -221
      kernel/arm64/sgemm_kernel_16x4.S
  32. +221
    -221
      kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
  33. +155
    -155
      kernel/arm64/sgemm_kernel_4x4.S
  34. +241
    -241
      kernel/arm64/sgemm_kernel_8x8.S
  35. +221
    -221
      kernel/arm64/strmm_kernel_16x4.S
  36. +130
    -130
      kernel/arm64/strmm_kernel_4x4.S
  37. +241
    -241
      kernel/arm64/strmm_kernel_8x8.S
  38. +21
    -21
      kernel/arm64/swap.S
  39. +25
    -25
      kernel/arm64/zamax.S
  40. +20
    -20
      kernel/arm64/zasum.S
  41. +21
    -21
      kernel/arm64/zaxpy.S
  42. +20
    -20
      kernel/arm64/zdot.S
  43. +130
    -130
      kernel/arm64/zgemm_kernel_4x4.S
  44. +130
    -130
      kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
  45. +26
    -26
      kernel/arm64/zgemv_n.S
  46. +26
    -26
      kernel/arm64/zgemv_t.S
  47. +16
    -16
      kernel/arm64/znrm2.S
  48. +20
    -20
      kernel/arm64/zrot.S
  49. +34
    -34
      kernel/arm64/zscal.S
  50. +130
    -130
      kernel/arm64/ztrmm_kernel_4x4.S

+ 25
- 25
kernel/arm64/amax.S View File

@@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lamax_kernel_S_BEGIN


amax_kernel_F_BEGIN:
.Lamax_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lamax_kernel_F1_INIT


INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1
beq .Lamax_kernel_F1


amax_kernel_F4:
.Lamax_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne amax_kernel_F4
bne .Lamax_kernel_F4


amax_kernel_F1:
.Lamax_kernel_F1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999


amax_kernel_F10:
.Lamax_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne amax_kernel_F10
bne .Lamax_kernel_F10


ret ret


amax_kernel_F1_INIT:
.Lamax_kernel_F1_INIT:


INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1
b .Lamax_kernel_F1


amax_kernel_S_BEGIN:
.Lamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble amax_kernel_L999
ble .Lamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1
ble .Lamax_kernel_S1


amax_kernel_S4:
.Lamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S4
bne .Lamax_kernel_S4


amax_kernel_S1:
.Lamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999


amax_kernel_S10:
.Lamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S10
bne .Lamax_kernel_S10


amax_kernel_L999:
.Lamax_kernel_L999:


ret ret


amax_kernel_zero:
.Lamax_kernel_zero:


fmov MAXF, REG0 fmov MAXF, REG0
ret ret


+ 20
- 20
kernel/arm64/asum.S View File

@@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lasum_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lasum_kernel_F1


asum_kernel_F8:
.Lasum_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne asum_kernel_F8
bne .Lasum_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


asum_kernel_F1:
.Lasum_kernel_F1:


ands I, N, #7 ands I, N, #7
ble asum_kernel_L999
ble .Lasum_kernel_L999


asum_kernel_F10:
.Lasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lasum_kernel_F10


asum_kernel_L999:
.Lasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lasum_kernel_S1


asum_kernel_S4:
.Lasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lasum_kernel_S4


asum_kernel_S1:
.Lasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lasum_kernel_L999


asum_kernel_S10:
.Lasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lasum_kernel_S10


ret ret




+ 21
- 21
kernel/arm64/axpy.S View File

@@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999
beq .Laxpy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN


axpy_kernel_F_BEGIN:
.Laxpy_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1
beq .Laxpy_kernel_F1


axpy_kernel_F8:
.Laxpy_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F8
bne .Laxpy_kernel_F8


axpy_kernel_F1:
.Laxpy_kernel_F1:


ands I, N, #7 ands I, N, #7
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


axpy_kernel_F10:
.Laxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10
bne .Laxpy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


axpy_kernel_S_BEGIN:
.Laxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1
ble .Laxpy_kernel_S1


axpy_kernel_S4:
.Laxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4
bne .Laxpy_kernel_S4


axpy_kernel_S1:
.Laxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


axpy_kernel_S10:
.Laxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10
bne .Laxpy_kernel_S10


axpy_kernel_L999:
.Laxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 20
- 20
kernel/arm64/casum.S View File

@@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF fmov s1, SUMF


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lcasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lcasum_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lcasum_kernel_F1


asum_kernel_F8:
.Lcasum_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne asum_kernel_F8
bne .Lcasum_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


asum_kernel_F1:
.Lcasum_kernel_F1:


ands I, N, #7 ands I, N, #7
ble asum_kernel_L999
ble .Lcasum_kernel_L999


asum_kernel_F10:
.Lcasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lcasum_kernel_F10


asum_kernel_L999:
.Lcasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lcasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lcasum_kernel_S1


asum_kernel_S4:
.Lcasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lcasum_kernel_S4


asum_kernel_S1:
.Lcasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lcasum_kernel_L999


asum_kernel_S10:
.Lcasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lcasum_kernel_S10


ret ret




+ 142
- 142
kernel/arm64/cgemm_kernel_4x4.S View File

@@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
add ppA, temp, pA add ppA, temp, pA


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5 .align 5


cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22




cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44




cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20




cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN


cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40


cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22




cgemm_kernel_L4_M4_40:
.Lcgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_42:
.Lcgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42
bgt .Lcgemm_kernel_L4_M4_42


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:




cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction






cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20
bgt .Lcgemm_kernel_L2_M4_20




cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:






cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN


cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20
bgt .Lcgemm_kernel_L1_M4_20




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4.S View File

@@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a


.align 5 .align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22


.align 5 .align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


.align 5 .align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


.align 5 .align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20


cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN




cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5




cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22


cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:


cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN


cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5


cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22




cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100


cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42


cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20


cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN


cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5


cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22




cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100


cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42


cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20


cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN




cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S View File

@@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #5 // origK / 32 asr counterL , origK, #5 // origK / 32
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a


.align 5 .align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22


.align 5 .align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x4
@@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


.align 5 .align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #31 ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


.align 5 .align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20


cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN




cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5




cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22


cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:


cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN


cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5


cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22




cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100


cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42


cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20


cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN


cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5


cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22




cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100


cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42


cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20


cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #4 // is a block of 4 left in M? (tests bit 2) tst counterI, #4 // is a block of 4 left in M? (tests bit 2)
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN




cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // is a block of 2 left in M? (tests bit 1) tst counterI, #2 // is a block of 2 left in M? (tests bit 1)
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 20
- 20
kernel/arm64/copy.S View File

@@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble copy_kernel_L999
ble .Lcopy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN


copy_kernel_F_BEGIN:
.Lcopy_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq copy_kernel_F1
beq .Lcopy_kernel_F1


copy_kernel_F4:
.Lcopy_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne copy_kernel_F4
bne .Lcopy_kernel_F4


copy_kernel_F1:
.Lcopy_kernel_F1:


ands I, N, #3 ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999


copy_kernel_F10:
.Lcopy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne copy_kernel_F10
bne .Lcopy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


copy_kernel_S_BEGIN:
.Lcopy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble copy_kernel_S1
ble .Lcopy_kernel_S1


copy_kernel_S4:
.Lcopy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne copy_kernel_S4
bne .Lcopy_kernel_S4


copy_kernel_S1:
.Lcopy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999


copy_kernel_S10:
.Lcopy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne copy_kernel_S10
bne .Lcopy_kernel_S10


copy_kernel_L999:
.Lcopy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


+ 129
- 129
kernel/arm64/ctrmm_kernel_4x4.S View File

@@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN


ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5


ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22




ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44


ctrmm_kernel_L4_M4_32:
.Lctrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E


b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44




ctrmm_kernel_L4_M4_40:
.Lctrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100


ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20
bne .Lctrmm_kernel_L4_M4_20


ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #2 // is a block of 2 left in M? (tests bit 1) tst counterI, #2 // is a block of 2 left in M? (tests bit 1)
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN


ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40


ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22




ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100


ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42


ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:




ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40


ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22




ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100


ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42


ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
.Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // done: N is a multiple of 4, nothing left
ble .Lctrmm_kernel_L999 // done: N is a multiple of 4, nothing left


tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN


ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5


ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22




ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100


ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42


ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20
bgt .Lctrmm_kernel_L2_M4_20




ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #2 // is a block of 2 left in M? (tests bit 1) tst counterI, #2 // is a block of 2 left in M? (tests bit 1)
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN


ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40


ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22




ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100


ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42


ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:




ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40


ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22




ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100


ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42


ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN


ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5


ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22




ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100


ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42


ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20
bgt .Lctrmm_kernel_L1_M4_20




ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #2 // is a block of 2 left in M? (tests bit 1) tst counterI, #2 // is a block of 2 left in M? (tests bit 1)
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN


ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40


ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22




ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100


ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42


ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:




ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40


ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22




ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100


ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42


ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:




ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/ctrmm_kernel_8x4.S View File

@@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ctrmm_kernel_L4_M8_BEGIN:
.Lctrmm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN
ble .Lctrmm_kernel_L4_M4_BEGIN


ctrmm_kernel_L4_M8_20:
.Lctrmm_kernel_L4_M8_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:


asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
blt .Lctrmm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
ble .Lctrmm_kernel_L4_M8_22a


.align 5 .align 5
ctrmm_kernel_L4_M8_22:
.Lctrmm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
bgt .Lctrmm_kernel_L4_M8_22


.align 5 .align 5
ctrmm_kernel_L4_M8_22a:
.Lctrmm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44


.align 5 .align 5
ctrmm_kernel_L4_M8_32:
.Lctrmm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M8_40
ble .Lctrmm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44


ctrmm_kernel_L4_M8_40:
.Lctrmm_kernel_L4_M8_40:


INIT8x4 INIT8x4


ctrmm_kernel_L4_M8_44:
.Lctrmm_kernel_L4_M8_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
ble .Lctrmm_kernel_L4_M8_100


.align 5 .align 5
ctrmm_kernel_L4_M8_46:
.Lctrmm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46
bne .Lctrmm_kernel_L4_M8_46


ctrmm_kernel_L4_M8_100:
.Lctrmm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


@@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


ctrmm_kernel_L4_M8_END:
.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20
bne .Lctrmm_kernel_L4_M8_20


ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN




ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5




ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22


ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100


ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:


ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #2 // is a block of 2 left in M? (tests bit 1) tst counterI, #2 // is a block of 2 left in M? (tests bit 1)
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN


ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40


ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22




ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100


ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42


ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:




ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40


ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22




ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100


ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42


ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L2_M8_BEGIN:
.Lctrmm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN
ble .Lctrmm_kernel_L2_M4_BEGIN


ctrmm_kernel_L2_M8_20:
.Lctrmm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M8_40
ble .Lctrmm_kernel_L2_M8_40
.align 5 .align 5


ctrmm_kernel_L2_M8_22:
.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22
bgt .Lctrmm_kernel_L2_M8_22




ctrmm_kernel_L2_M8_40:
.Lctrmm_kernel_L2_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100
ble .Lctrmm_kernel_L2_M8_100


ctrmm_kernel_L2_M8_42:
.Lctrmm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42
bgt .Lctrmm_kernel_L2_M8_42


ctrmm_kernel_L2_M8_100:
.Lctrmm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


@@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


ctrmm_kernel_L2_M8_END:
.Lctrmm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20
bgt .Lctrmm_kernel_L2_M8_20


ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN


ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5


ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22




ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100


ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42


ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:




ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN


ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40


ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22




ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100


ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42


ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:




ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40


ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22




ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100


ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42


ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L1_M8_BEGIN:
.Lctrmm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN
ble .Lctrmm_kernel_L1_M4_BEGIN


ctrmm_kernel_L1_M8_20:
.Lctrmm_kernel_L1_M8_20:


INIT8x1 INIT8x1


@@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M8_40
ble .Lctrmm_kernel_L1_M8_40
.align 5 .align 5


ctrmm_kernel_L1_M8_22:
.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22
bgt .Lctrmm_kernel_L1_M8_22




ctrmm_kernel_L1_M8_40:
.Lctrmm_kernel_L1_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100
ble .Lctrmm_kernel_L1_M8_100


ctrmm_kernel_L1_M8_42:
.Lctrmm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42
bgt .Lctrmm_kernel_L1_M8_42


ctrmm_kernel_L1_M8_100:
.Lctrmm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


@@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


ctrmm_kernel_L1_M8_END:
.Lctrmm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20
bgt .Lctrmm_kernel_L1_M8_20


ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN


ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5


ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22




ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100


ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42


ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:


ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN


ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40


ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22




ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100


ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42


ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:




ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40


ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22




ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100


ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42


ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:




ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 22
- 22
kernel/arm64/daxpy_thunderx2t99.S View File

@@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999
beq .Ldaxpy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN


axpy_kernel_F_BEGIN:
.Ldaxpy_kernel_F_BEGIN:


asr I, N, #5 asr I, N, #5
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1
beq .Ldaxpy_kernel_F1


.align 5 .align 5
axpy_kernel_F32:
.Ldaxpy_kernel_F32:


KERNEL_F32 KERNEL_F32


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F32
bne .Ldaxpy_kernel_F32


axpy_kernel_F1:
.Ldaxpy_kernel_F1:


ands I, N, #31 ands I, N, #31
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


axpy_kernel_F10:
.Ldaxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10
bne .Ldaxpy_kernel_F10


b axpy_kernel_L999
b .Ldaxpy_kernel_L999


axpy_kernel_S_BEGIN:
.Ldaxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1
ble .Ldaxpy_kernel_S1


axpy_kernel_S4:
.Ldaxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4
bne .Ldaxpy_kernel_S4


axpy_kernel_S1:
.Ldaxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


axpy_kernel_S10:
.Ldaxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10
bne .Ldaxpy_kernel_S10


axpy_kernel_L999:
.Ldaxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 143
- 143
kernel/arm64/dgemm_kernel_4x4.S View File

@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:


//------------------------------------------------------------------------------ //------------------------------------------------------------------------------


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #2 // L = K / 4 asr counterL , origK, #2 // L = K / 4
cmp counterL , #2 cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44




dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #3 ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5 lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp] prfm PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22




dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:




dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction






dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5


dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22




dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22




dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:






dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5


dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20




dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22




dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dgemm_kernel_4x8.S View File

@@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN
ble .Ldgemm_kernel_L4_BEGIN


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L8_BEGIN:
.Ldgemm_kernel_L8_BEGIN:


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L8_M4_BEGIN:
.Ldgemm_kernel_L8_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN
ble .Ldgemm_kernel_L8_M2_BEGIN


dgemm_kernel_L8_M4_20:
.Ldgemm_kernel_L8_M4_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32
blt .Ldgemm_kernel_L8_M4_32


KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a
ble .Ldgemm_kernel_L8_M4_22a
.align 5 .align 5


dgemm_kernel_L8_M4_22:
.Ldgemm_kernel_L8_M4_22:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22
bgt .Ldgemm_kernel_L8_M4_22




dgemm_kernel_L8_M4_22a:
.Ldgemm_kernel_L8_M4_22a:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E


b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44


dgemm_kernel_L8_M4_32:
.Ldgemm_kernel_L8_M4_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L8_M4_40
ble .Ldgemm_kernel_L8_M4_40


KERNEL4x8_I KERNEL4x8_I


KERNEL4x8_E KERNEL4x8_E


b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44




dgemm_kernel_L8_M4_40:
.Ldgemm_kernel_L8_M4_40:


INIT4x8 INIT4x8


dgemm_kernel_L8_M4_44:
.Ldgemm_kernel_L8_M4_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100
ble .Ldgemm_kernel_L8_M4_100


dgemm_kernel_L8_M4_46:
.Ldgemm_kernel_L8_M4_46:


KERNEL4x8_SUB KERNEL4x8_SUB


dgemm_kernel_L8_M4_100:
.Ldgemm_kernel_L8_M4_100:


SAVE4x8 SAVE4x8


dgemm_kernel_L8_M4_END:
.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20
bne .Ldgemm_kernel_L8_M4_20


dgemm_kernel_L8_M2_BEGIN:
.Ldgemm_kernel_L8_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L8_M1_BEGIN
ble .Ldgemm_kernel_L8_M1_BEGIN


dgemm_kernel_L8_M2_20:
.Ldgemm_kernel_L8_M2_20:


INIT2x8 INIT2x8


@@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M2_40
ble .Ldgemm_kernel_L8_M2_40


dgemm_kernel_L8_M2_22:
.Ldgemm_kernel_L8_M2_22:


KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22
bgt .Ldgemm_kernel_L8_M2_22




dgemm_kernel_L8_M2_40:
.Ldgemm_kernel_L8_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M2_100
ble .Ldgemm_kernel_L8_M2_100


dgemm_kernel_L8_M2_42:
.Ldgemm_kernel_L8_M2_42:


KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42
bgt .Ldgemm_kernel_L8_M2_42


dgemm_kernel_L8_M2_100:
.Ldgemm_kernel_L8_M2_100:


SAVE2x8 SAVE2x8


dgemm_kernel_L8_M2_END:
.Ldgemm_kernel_L8_M2_END:




dgemm_kernel_L8_M1_BEGIN:
.Ldgemm_kernel_L8_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END


dgemm_kernel_L8_M1_20:
.Ldgemm_kernel_L8_M1_20:


INIT1x8 INIT1x8


@@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M1_40
ble .Ldgemm_kernel_L8_M1_40


dgemm_kernel_L8_M1_22:
.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22
bgt .Ldgemm_kernel_L8_M1_22




dgemm_kernel_L8_M1_40:
.Ldgemm_kernel_L8_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M1_100
ble .Ldgemm_kernel_L8_M1_100


dgemm_kernel_L8_M1_42:
.Ldgemm_kernel_L8_M1_42:


KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42
bgt .Ldgemm_kernel_L8_M1_42


dgemm_kernel_L8_M1_100:
.Ldgemm_kernel_L8_M1_100:


SAVE1x8 SAVE1x8


dgemm_kernel_L8_END:
.Ldgemm_kernel_L8_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L8_BEGIN
bgt .Ldgemm_kernel_L8_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dgemm_kernel_L999
ble .Ldgemm_kernel_L999


tst counterJ , #4 tst counterJ , #4
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M4_32
blt .Ldgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a
ble .Ldgemm_kernel_L4_M4_22a
.align 5 .align 5


dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22




dgemm_kernel_L4_M4_22a:
.Ldgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44


dgemm_kernel_L4_M4_32:
.Ldgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44




dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dgemm_kernel_L4_M4_44:
.Ldgemm_kernel_L4_M4_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_46:
.Ldgemm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20
bne .Ldgemm_kernel_L4_M4_20


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5


dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22




dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22




dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5


dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20




dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22




dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4.S View File

@@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


.align 5 .align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


.align 5 .align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


.align 5 .align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


.align 5 .align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40


.align 5 .align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22


dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42


dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20


dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40


.align 5 .align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40


.align 5 .align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22




dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42


dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40


.align 5 .align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40




dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S View File

@@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


.align 5 .align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #7 // L = K / 128 asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x1


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:


KERNEL8x4_M1_M2_x64 KERNEL8x4_M1_M2_x64


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16
@@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #127 ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


.align 5 .align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


.align 5 .align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


.align 5 .align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40


.align 5 .align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22


dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42


dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20


dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40


.align 5 .align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40


.align 5 .align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22




dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42


dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #4 // is bit 2 (value 4) of counterI set? tst counterI, #4 // is bit 2 (value 4) of counterI set?
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40


.align 5 .align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // is bit 1 (value 2) of counterI set? tst counterI, #2 // is bit 1 (value 2) of counterI set?
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40




dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 36
- 36
kernel/arm64/dgemm_ncopy_4.S View File

@@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE


dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:


asr J, N, #2 // J = N / 4 asr J, N, #2 // J = N / 4
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN


.align 5 .align 5
dgemm_ncopy_L4_M4_BEGIN:
.Ldgemm_ncopy_L4_M4_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_40
ble .Ldgemm_ncopy_L4_M4_40


.align 5 .align 5
dgemm_ncopy_L4_M4_20:
.Ldgemm_ncopy_L4_M4_20:


COPY4x4 COPY4x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_20
bne .Ldgemm_ncopy_L4_M4_20




dgemm_ncopy_L4_M4_40:
.Ldgemm_ncopy_L4_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_END
ble .Ldgemm_ncopy_L4_M4_END


.align 5 .align 5
dgemm_ncopy_L4_M4_60:
.Ldgemm_ncopy_L4_M4_60:


COPY1x4 COPY1x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_60
bne .Ldgemm_ncopy_L4_M4_60




dgemm_ncopy_L4_M4_END:
.Ldgemm_ncopy_L4_M4_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
bne .Ldgemm_ncopy_L4_M4_BEGIN






/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:


tst N, #3 tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN


dgemm_ncopy_L2_M4_BEGIN:
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_40
ble .Ldgemm_ncopy_L2_M4_40


.align 5 .align 5
dgemm_ncopy_L2_M4_20:
.Ldgemm_ncopy_L2_M4_20:


COPY4x2 COPY4x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_20
bne .Ldgemm_ncopy_L2_M4_20




dgemm_ncopy_L2_M4_40:
.Ldgemm_ncopy_L2_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_END
ble .Ldgemm_ncopy_L2_M4_END


.align 5 .align 5
dgemm_ncopy_L2_M4_60:
.Ldgemm_ncopy_L2_M4_60:


COPY1x2 COPY1x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_60
bne .Ldgemm_ncopy_L2_M4_60




dgemm_ncopy_L2_M4_END:
.Ldgemm_ncopy_L2_M4_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:


tst N, #1 tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999




dgemm_ncopy_L1_M4_BEGIN:
.Ldgemm_ncopy_L1_M4_BEGIN:


mov A01, A00 mov A01, A00


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_40
ble .Ldgemm_ncopy_L1_M4_40


.align 5 .align 5
dgemm_ncopy_L1_M4_20:
.Ldgemm_ncopy_L1_M4_20:


COPY4x1 COPY4x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_20
bne .Ldgemm_ncopy_L1_M4_20




dgemm_ncopy_L1_M4_40:
.Ldgemm_ncopy_L1_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_END
ble .Ldgemm_ncopy_L1_M4_END


.align 5 .align 5
dgemm_ncopy_L1_M4_60:
.Ldgemm_ncopy_L1_M4_60:


COPY1x1 COPY1x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_60
bne .Ldgemm_ncopy_L1_M4_60




dgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L1_M4_END:


dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:


mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS


+ 48
- 48
kernel/arm64/dgemm_ncopy_8.S View File

@@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE


dgemm_ncopy_L8_BEGIN:
.Ldgemm_ncopy_L8_BEGIN:


asr J, N, #3 // J = N / 8 asr J, N, #3 // J = N / 8
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L4_BEGIN
ble .Ldgemm_ncopy_L4_BEGIN


dgemm_ncopy_L8_M8_BEGIN:
.Ldgemm_ncopy_L8_M8_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_40
ble .Ldgemm_ncopy_L8_M8_40


dgemm_ncopy_L8_M8_20:
.Ldgemm_ncopy_L8_M8_20:


COPY8x8 COPY8x8


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_20
bne .Ldgemm_ncopy_L8_M8_20




dgemm_ncopy_L8_M8_40:
.Ldgemm_ncopy_L8_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_END
ble .Ldgemm_ncopy_L8_M8_END


dgemm_ncopy_L8_M8_60:
.Ldgemm_ncopy_L8_M8_60:


COPY1x8 COPY1x8


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_60
bne .Ldgemm_ncopy_L8_M8_60




dgemm_ncopy_L8_M8_END:
.Ldgemm_ncopy_L8_M8_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
bne .Ldgemm_ncopy_L8_M8_BEGIN


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:


tst N, #7 tst N, #7
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #4 tst N, #4
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN


dgemm_ncopy_L4_M8_BEGIN:
.Ldgemm_ncopy_L4_M8_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_40
ble .Ldgemm_ncopy_L4_M8_40


dgemm_ncopy_L4_M8_20:
.Ldgemm_ncopy_L4_M8_20:


COPY8x4 COPY8x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_20
bne .Ldgemm_ncopy_L4_M8_20




dgemm_ncopy_L4_M8_40:
.Ldgemm_ncopy_L4_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_END
ble .Ldgemm_ncopy_L4_M8_END


dgemm_ncopy_L4_M8_60:
.Ldgemm_ncopy_L4_M8_60:


COPY1x4 COPY1x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_60
bne .Ldgemm_ncopy_L4_M8_60




dgemm_ncopy_L4_M8_END:
.Ldgemm_ncopy_L4_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:


tst N, #3 tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN


dgemm_ncopy_L2_M8_BEGIN:
.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_40
ble .Ldgemm_ncopy_L2_M8_40


dgemm_ncopy_L2_M8_20:
.Ldgemm_ncopy_L2_M8_20:


COPY8x2 COPY8x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_20
bne .Ldgemm_ncopy_L2_M8_20




dgemm_ncopy_L2_M8_40:
.Ldgemm_ncopy_L2_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_END
ble .Ldgemm_ncopy_L2_M8_END


dgemm_ncopy_L2_M8_60:
.Ldgemm_ncopy_L2_M8_60:


COPY1x2 COPY1x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_60
bne .Ldgemm_ncopy_L2_M8_60




dgemm_ncopy_L2_M8_END:
.Ldgemm_ncopy_L2_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:


tst N, #1 tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999




dgemm_ncopy_L1_M8_BEGIN:
.Ldgemm_ncopy_L1_M8_BEGIN:


mov A01, A00 mov A01, A00


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_40
ble .Ldgemm_ncopy_L1_M8_40


dgemm_ncopy_L1_M8_20:
.Ldgemm_ncopy_L1_M8_20:


COPY8x1 COPY8x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_20
bne .Ldgemm_ncopy_L1_M8_20




dgemm_ncopy_L1_M8_40:
.Ldgemm_ncopy_L1_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_END
ble .Ldgemm_ncopy_L1_M8_END


dgemm_ncopy_L1_M8_60:
.Ldgemm_ncopy_L1_M8_60:


COPY1x1 COPY1x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_60
bne .Ldgemm_ncopy_L1_M8_60




dgemm_ncopy_L1_M8_END:
.Ldgemm_ncopy_L1_M8_END:


dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:


mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS


+ 36
- 36
kernel/arm64/dgemm_tcopy_4.S View File

@@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl M4, M, #5 // M4 = M * 4 * SIZE lsl M4, M, #5 // M4 = M * 4 * SIZE


dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4 asr J, M, #2 // J = M / 4
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN


.align 5 .align 5
dgemm_tcopy_L4_M4_BEGIN:
.Ldgemm_tcopy_L4_M4_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:


asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M4_40
ble .Ldgemm_tcopy_L4_M4_40


.align 5 .align 5
dgemm_tcopy_L4_M4_20:
.Ldgemm_tcopy_L4_M4_20:


COPY4x4 COPY4x4


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M4_20
bne .Ldgemm_tcopy_L4_M4_20




dgemm_tcopy_L4_M4_40:
.Ldgemm_tcopy_L4_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M4_60
ble .Ldgemm_tcopy_L4_M4_60


COPY2x4 COPY2x4




dgemm_tcopy_L4_M4_60:
.Ldgemm_tcopy_L4_M4_60:


tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M4_END
ble .Ldgemm_tcopy_L4_M4_END


COPY1x4 COPY1x4




dgemm_tcopy_L4_M4_END:
.Ldgemm_tcopy_L4_M4_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
bne .Ldgemm_tcopy_L4_M4_BEGIN






/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:


tst M, #3 tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN


dgemm_tcopy_L2_M4_BEGIN:
.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:


asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M4_40
ble .Ldgemm_tcopy_L2_M4_40


.align 5 .align 5
dgemm_tcopy_L2_M4_20:
.Ldgemm_tcopy_L2_M4_20:


COPY4x2 COPY4x2


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M4_20
bne .Ldgemm_tcopy_L2_M4_20




dgemm_tcopy_L2_M4_40:
.Ldgemm_tcopy_L2_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M4_60
ble .Ldgemm_tcopy_L2_M4_60


COPY2x2 COPY2x2


dgemm_tcopy_L2_M4_60:
.Ldgemm_tcopy_L2_M4_60:


tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M4_END
ble .Ldgemm_tcopy_L2_M4_END


COPY1x2 COPY1x2




dgemm_tcopy_L2_M4_END:
.Ldgemm_tcopy_L2_M4_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:


tst M, #1 tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999




dgemm_tcopy_L1_M4_BEGIN:
.Ldgemm_tcopy_L1_M4_BEGIN:


mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B


asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M4_40
ble .Ldgemm_tcopy_L1_M4_40


.align 5 .align 5
dgemm_tcopy_L1_M4_20:
.Ldgemm_tcopy_L1_M4_20:


COPY4x1 COPY4x1


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M4_20
bne .Ldgemm_tcopy_L1_M4_20




dgemm_tcopy_L1_M4_40:
.Ldgemm_tcopy_L1_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M4_60
ble .Ldgemm_tcopy_L1_M4_60


COPY2x1 COPY2x1


dgemm_tcopy_L1_M4_60:
.Ldgemm_tcopy_L1_M4_60:


tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M4_END
ble .Ldgemm_tcopy_L1_M4_END


COPY1x1 COPY1x1




dgemm_tcopy_L1_M4_END:
.Ldgemm_tcopy_L1_M4_END:




dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret


+ 56
- 56
kernel/arm64/dgemm_tcopy_8.S View File

@@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl M8, M, #6 // M8 = M * 8 * SIZE lsl M8, M, #6 // M8 = M * 8 * SIZE


dgemm_tcopy_L8_BEGIN:
.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8 asr J, M, #3 // J = M / 8
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L4_BEGIN
ble .Ldgemm_tcopy_L4_BEGIN


.align 5 .align 5
dgemm_tcopy_L8_M8_BEGIN:
.Ldgemm_tcopy_L8_M8_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L8_M8_40
ble .Ldgemm_tcopy_L8_M8_40


.align 5 .align 5
dgemm_tcopy_L8_M8_20:
.Ldgemm_tcopy_L8_M8_20:


COPY8x8 COPY8x8


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L8_M8_20
bne .Ldgemm_tcopy_L8_M8_20


dgemm_tcopy_L8_M8_40:
.Ldgemm_tcopy_L8_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L8_M8_60
ble .Ldgemm_tcopy_L8_M8_60


COPY4x8 COPY4x8


dgemm_tcopy_L8_M8_60:
.Ldgemm_tcopy_L8_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L8_M8_80
ble .Ldgemm_tcopy_L8_M8_80


COPY2x8 COPY2x8




dgemm_tcopy_L8_M8_80:
.Ldgemm_tcopy_L8_M8_80:


tst N, #1 tst N, #1
ble dgemm_tcopy_L8_M8_END
ble .Ldgemm_tcopy_L8_M8_END


COPY1x8 COPY1x8




dgemm_tcopy_L8_M8_END:
.Ldgemm_tcopy_L8_M8_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
bne .Ldgemm_tcopy_L8_M8_BEGIN


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
tst M, #7 tst M, #7
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #4 tst M, #4
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN


dgemm_tcopy_L4_M8_BEGIN:
.Ldgemm_tcopy_L4_M8_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M8_40
ble .Ldgemm_tcopy_L4_M8_40


.align 5 .align 5
dgemm_tcopy_L4_M8_20:
.Ldgemm_tcopy_L4_M8_20:


COPY8x4 COPY8x4


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M8_20
bne .Ldgemm_tcopy_L4_M8_20


dgemm_tcopy_L4_M8_40:
.Ldgemm_tcopy_L4_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L4_M8_60
ble .Ldgemm_tcopy_L4_M8_60


COPY4x4 COPY4x4


dgemm_tcopy_L4_M8_60:
.Ldgemm_tcopy_L4_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M8_80
ble .Ldgemm_tcopy_L4_M8_80


COPY2x4 COPY2x4




dgemm_tcopy_L4_M8_80:
.Ldgemm_tcopy_L4_M8_80:


tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M8_END
ble .Ldgemm_tcopy_L4_M8_END


COPY1x4 COPY1x4




dgemm_tcopy_L4_M8_END:
.Ldgemm_tcopy_L4_M8_END:


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:


tst M, #3 tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN


dgemm_tcopy_L2_M8_BEGIN:
.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M8_40
ble .Ldgemm_tcopy_L2_M8_40


.align 5 .align 5
dgemm_tcopy_L2_M8_20:
.Ldgemm_tcopy_L2_M8_20:


COPY8x2 COPY8x2


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M8_20
bne .Ldgemm_tcopy_L2_M8_20


dgemm_tcopy_L2_M8_40:
.Ldgemm_tcopy_L2_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L2_M8_60
ble .Ldgemm_tcopy_L2_M8_60


COPY4x2 COPY4x2


dgemm_tcopy_L2_M8_60:
.Ldgemm_tcopy_L2_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M8_80
ble .Ldgemm_tcopy_L2_M8_80


COPY2x2 COPY2x2


dgemm_tcopy_L2_M8_80:
.Ldgemm_tcopy_L2_M8_80:


tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M8_END
ble .Ldgemm_tcopy_L2_M8_END


COPY1x2 COPY1x2




dgemm_tcopy_L2_M8_END:
.Ldgemm_tcopy_L2_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:


tst M, #1 tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999




dgemm_tcopy_L1_M8_BEGIN:
.Ldgemm_tcopy_L1_M8_BEGIN:


mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M8_40
ble .Ldgemm_tcopy_L1_M8_40


.align 5 .align 5
dgemm_tcopy_L1_M8_20:
.Ldgemm_tcopy_L1_M8_20:


COPY8x1 COPY8x1


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M8_20
bne .Ldgemm_tcopy_L1_M8_20


dgemm_tcopy_L1_M8_40:
.Ldgemm_tcopy_L1_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L1_M8_60
ble .Ldgemm_tcopy_L1_M8_60


COPY4x1 COPY4x1


dgemm_tcopy_L1_M8_60:
.Ldgemm_tcopy_L1_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M8_80
ble .Ldgemm_tcopy_L1_M8_80


COPY2x1 COPY2x1


dgemm_tcopy_L1_M8_80:
.Ldgemm_tcopy_L1_M8_80:


tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M8_END
ble .Ldgemm_tcopy_L1_M8_END


COPY1x1 COPY1x1




dgemm_tcopy_L1_M8_END:
.Ldgemm_tcopy_L1_M8_END:




dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret


+ 20
- 20
kernel/arm64/dot.S View File

@@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble dot_kernel_L999
ble .Ldot_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN


dot_kernel_F_BEGIN:
.Ldot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1
beq .Ldot_kernel_F1


dot_kernel_F4:
.Ldot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne dot_kernel_F4
bne .Ldot_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


dot_kernel_F1:
.Ldot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999


dot_kernel_F10:
.Ldot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne dot_kernel_F10
bne .Ldot_kernel_F10


ret ret


dot_kernel_S_BEGIN:
.Ldot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1
ble .Ldot_kernel_S1


dot_kernel_S4:
.Ldot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S4
bne .Ldot_kernel_S4


dot_kernel_S1:
.Ldot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999


dot_kernel_S10:
.Ldot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S10
bne .Ldot_kernel_S10


dot_kernel_L999:
.Ldot_kernel_L999:


ret ret




+ 129
- 129
kernel/arm64/dtrmm_kernel_4x4.S View File

@@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20


dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20




dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dtrmm_kernel_4x8.S View File

@@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN
ble .Ldtrmm_kernel_L4_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L8_BEGIN:
.Ldtrmm_kernel_L8_BEGIN:


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3
@@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L8_M4_BEGIN:
.Ldtrmm_kernel_L8_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L8_M2_BEGIN
ble .Ldtrmm_kernel_L8_M2_BEGIN


dtrmm_kernel_L8_M4_20:
.Ldtrmm_kernel_L8_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:


asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32
blt .Ldtrmm_kernel_L8_M4_32


KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a
ble .Ldtrmm_kernel_L8_M4_22a
.align 5 .align 5


dtrmm_kernel_L8_M4_22:
.Ldtrmm_kernel_L8_M4_22:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M4_22
bgt .Ldtrmm_kernel_L8_M4_22




dtrmm_kernel_L8_M4_22a:
.Ldtrmm_kernel_L8_M4_22a:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E


b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44


dtrmm_kernel_L8_M4_32:
.Ldtrmm_kernel_L8_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L8_M4_40
ble .Ldtrmm_kernel_L8_M4_40


KERNEL4x8_I KERNEL4x8_I


KERNEL4x8_E KERNEL4x8_E


b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44




dtrmm_kernel_L8_M4_40:
.Ldtrmm_kernel_L8_M4_40:


INIT4x8 INIT4x8


dtrmm_kernel_L8_M4_44:
.Ldtrmm_kernel_L8_M4_44:


ands counterL, tempK, #1 ands counterL, tempK, #1
ble dtrmm_kernel_L8_M4_100
ble .Ldtrmm_kernel_L8_M4_100


dtrmm_kernel_L8_M4_46:
.Ldtrmm_kernel_L8_M4_46:


KERNEL4x8_SUB KERNEL4x8_SUB


dtrmm_kernel_L8_M4_100:
.Ldtrmm_kernel_L8_M4_100:


SAVE4x8 SAVE4x8


@@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L8_M4_END:
.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L8_M4_20
bne .Ldtrmm_kernel_L8_M4_20


dtrmm_kernel_L8_M2_BEGIN:
.Ldtrmm_kernel_L8_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L8_M1_BEGIN
ble .Ldtrmm_kernel_L8_M1_BEGIN


dtrmm_kernel_L8_M2_20:
.Ldtrmm_kernel_L8_M2_20:


INIT2x8 INIT2x8


@@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:


asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M2_40
ble .Ldtrmm_kernel_L8_M2_40


dtrmm_kernel_L8_M2_22:
.Ldtrmm_kernel_L8_M2_22:


KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22
bgt .Ldtrmm_kernel_L8_M2_22




dtrmm_kernel_L8_M2_40:
.Ldtrmm_kernel_L8_M2_40:


ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M2_100
ble .Ldtrmm_kernel_L8_M2_100


dtrmm_kernel_L8_M2_42:
.Ldtrmm_kernel_L8_M2_42:


KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_42
bgt .Ldtrmm_kernel_L8_M2_42


dtrmm_kernel_L8_M2_100:
.Ldtrmm_kernel_L8_M2_100:


SAVE2x8 SAVE2x8


@@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L8_M2_END:
.Ldtrmm_kernel_L8_M2_END:




dtrmm_kernel_L8_M1_BEGIN:
.Ldtrmm_kernel_L8_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END


dtrmm_kernel_L8_M1_20:
.Ldtrmm_kernel_L8_M1_20:


INIT1x8 INIT1x8


@@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:


asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M1_40
ble .Ldtrmm_kernel_L8_M1_40


dtrmm_kernel_L8_M1_22:
.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22
bgt .Ldtrmm_kernel_L8_M1_22




dtrmm_kernel_L8_M1_40:
.Ldtrmm_kernel_L8_M1_40:


ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M1_100
ble .Ldtrmm_kernel_L8_M1_100


dtrmm_kernel_L8_M1_42:
.Ldtrmm_kernel_L8_M1_42:


KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_42
bgt .Ldtrmm_kernel_L8_M1_42


dtrmm_kernel_L8_M1_100:
.Ldtrmm_kernel_L8_M1_100:


SAVE1x8 SAVE1x8


@@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L8_END:
.Ldtrmm_kernel_L8_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8
@@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN
bgt .Ldtrmm_kernel_L8_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dtrmm_kernel_L999
ble .Ldtrmm_kernel_L999


tst counterJ , #4 tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:


asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20


dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20




dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // test bit 1 of M: is a block of 2 left? tst counterI, #2 // test bit 1 of M: is a block of 2 left?
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dtrmm_kernel_8x4.S View File

@@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M8_BEGIN:
.Ldtrmm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
ble .Ldtrmm_kernel_L4_M4_BEGIN


.align 5 .align 5
dtrmm_kernel_L4_M8_20:
.Ldtrmm_kernel_L4_M8_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:


asr counterL , tempK, #3 // L = K / 8 asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // at least 2 blocks of 8 in K? cmp counterL , #2 // at least 2 blocks of 8 in K?
blt dtrmm_kernel_L4_M8_32
blt .Ldtrmm_kernel_L4_M8_32


KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K
@@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
ble .Ldtrmm_kernel_L4_M8_22a


.align 5 .align 5
dtrmm_kernel_L4_M8_22:
.Ldtrmm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
bgt .Ldtrmm_kernel_L4_M8_22


.align 5 .align 5
dtrmm_kernel_L4_M8_22a:
.Ldtrmm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44


.align 5 .align 5
dtrmm_kernel_L4_M8_32:
.Ldtrmm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M8_40
ble .Ldtrmm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44


dtrmm_kernel_L4_M8_40:
.Ldtrmm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dtrmm_kernel_L4_M8_44:
.Ldtrmm_kernel_L4_M8_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
ble .Ldtrmm_kernel_L4_M8_100


.align 5 .align 5
dtrmm_kernel_L4_M8_46:
.Ldtrmm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
bne .Ldtrmm_kernel_L4_M8_46


dtrmm_kernel_L4_M8_100:
.Ldtrmm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


@@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


dtrmm_kernel_L4_M8_END:
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M8_20
bne .Ldtrmm_kernel_L4_M8_20


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_42:
.Ldtrmm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_42
bgt .Ldtrmm_kernel_L4_M4_42


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:




dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // test bit 1 of M: is a block of 2 left? tst counterI, #2 // test bit 1 of M: is a block of 2 left?
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // done: N is a multiple of 4, nothing left
ble .Ldtrmm_kernel_L999 // done: N is a multiple of 4, nothing left


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L2_M8_BEGIN:
.Ldtrmm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN
ble .Ldtrmm_kernel_L2_M4_BEGIN


dtrmm_kernel_L2_M8_20:
.Ldtrmm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M8_40
ble .Ldtrmm_kernel_L2_M8_40
.align 5 .align 5


dtrmm_kernel_L2_M8_22:
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22
bgt .Ldtrmm_kernel_L2_M8_22




dtrmm_kernel_L2_M8_40:
.Ldtrmm_kernel_L2_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100
ble .Ldtrmm_kernel_L2_M8_100


dtrmm_kernel_L2_M8_42:
.Ldtrmm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42
bgt .Ldtrmm_kernel_L2_M8_42


dtrmm_kernel_L2_M8_100:
.Ldtrmm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


@@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


dtrmm_kernel_L2_M8_END:
.Ldtrmm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20
bgt .Ldtrmm_kernel_L2_M8_20


dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #4 // test bit 2 of M: is a block of 4 left? tst counterI, #4 // test bit 2 of M: is a block of 4 left?
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // test bit 1 of M: is a block of 2 left? tst counterI, #2 // test bit 1 of M: is a block of 2 left?
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M8_BEGIN:
.Ldtrmm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN
ble .Ldtrmm_kernel_L1_M4_BEGIN


dtrmm_kernel_L1_M8_20:
.Ldtrmm_kernel_L1_M8_20:


INIT8x1 INIT8x1


@@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M8_40
ble .Ldtrmm_kernel_L1_M8_40
.align 5 .align 5


dtrmm_kernel_L1_M8_22:
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22
bgt .Ldtrmm_kernel_L1_M8_22




dtrmm_kernel_L1_M8_40:
.Ldtrmm_kernel_L1_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100
ble .Ldtrmm_kernel_L1_M8_100


dtrmm_kernel_L1_M8_42:
.Ldtrmm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42
bgt .Ldtrmm_kernel_L1_M8_42


dtrmm_kernel_L1_M8_100:
.Ldtrmm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


@@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


dtrmm_kernel_L1_M8_END:
.Ldtrmm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20
bgt .Ldtrmm_kernel_L1_M8_20


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #4 // test bit 2 of M: is a block of 4 left? tst counterI, #4 // test bit 2 of M: is a block of 4 left?
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // test bit 1 of M: is a block of 2 left? tst counterI, #2 // test bit 1 of M: is a block of 2 left?
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 31
- 31
kernel/arm64/gemv_n.S View File

@@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
mov J, N mov J, N


cmp INC_Y, #1 cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
bne .Lgemv_n_kernel_S_BEGIN


gemv_n_kernel_F_LOOP:
.Lgemv_n_kernel_F_LOOP:


ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y


gemv_n_kernel_F32:
.Lgemv_n_kernel_F32:


asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F4
beq .Lgemv_n_kernel_F4


gemv_n_kernel_F320:
.Lgemv_n_kernel_F320:


KERNEL_F16 KERNEL_F16
KERNEL_F16 KERNEL_F16


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F320
bne .Lgemv_n_kernel_F320


gemv_n_kernel_F4:
.Lgemv_n_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F1
beq .Lgemv_n_kernel_F1


gemv_n_kernel_F40:
.Lgemv_n_kernel_F40:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F40
bne .Lgemv_n_kernel_F40


gemv_n_kernel_F1:
.Lgemv_n_kernel_F1:
ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_F_END
ble .Lgemv_n_kernel_F_END


gemv_n_kernel_F10:
.Lgemv_n_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F10
bne .Lgemv_n_kernel_F10


gemv_n_kernel_F_END:
.Lgemv_n_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_F_LOOP
bne .Lgemv_n_kernel_F_LOOP


b gemv_n_kernel_L999
b .Lgemv_n_kernel_L999


gemv_n_kernel_S_BEGIN:
.Lgemv_n_kernel_S_BEGIN:


INIT_S INIT_S


gemv_n_kernel_S_LOOP:
.Lgemv_n_kernel_S_LOOP:


ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_n_kernel_S1
ble .Lgemv_n_kernel_S1


gemv_n_kernel_S4:
.Lgemv_n_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S4
bne .Lgemv_n_kernel_S4


gemv_n_kernel_S1:
.Lgemv_n_kernel_S1:


ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_S_END
ble .Lgemv_n_kernel_S_END


gemv_n_kernel_S10:
.Lgemv_n_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S10
bne .Lgemv_n_kernel_S10


gemv_n_kernel_S_END:
.Lgemv_n_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_S_LOOP
bne .Lgemv_n_kernel_S_LOOP


gemv_n_kernel_L999:
.Lgemv_n_kernel_L999:


mov w0, wzr mov w0, wzr




+ 31
- 31
kernel/arm64/gemv_t.S View File

@@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
mov J, N mov J, N


cmp INC_X, #1 cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
bne .Lgemv_t_kernel_S_BEGIN


gemv_t_kernel_F_LOOP:
.Lgemv_t_kernel_F_LOOP:


fmov TEMP, REG0 fmov TEMP, REG0
fmov TEMP1, REG0 fmov TEMP1, REG0
@@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X


gemv_t_kernel_F32:
.Lgemv_t_kernel_F32:


asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F4
beq .Lgemv_t_kernel_F4


gemv_t_kernel_F320:
.Lgemv_t_kernel_F320:


KERNEL_F32 KERNEL_F32


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F320
bne .Lgemv_t_kernel_F320


KERNEL_F32_FINALIZE KERNEL_F32_FINALIZE


gemv_t_kernel_F4:
.Lgemv_t_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F1
beq .Lgemv_t_kernel_F1


gemv_t_kernel_F40:
.Lgemv_t_kernel_F40:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F40
bne .Lgemv_t_kernel_F40


gemv_t_kernel_F1:
.Lgemv_t_kernel_F1:


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_F_END
ble .Lgemv_t_kernel_F_END


gemv_t_kernel_F10:
.Lgemv_t_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F10
bne .Lgemv_t_kernel_F10


gemv_t_kernel_F_END:
.Lgemv_t_kernel_F_END:


ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
bne .Lgemv_t_kernel_F_LOOP


b gemv_t_kernel_L999
b .Lgemv_t_kernel_L999


gemv_t_kernel_S_BEGIN:
.Lgemv_t_kernel_S_BEGIN:


INIT_S INIT_S


gemv_t_kernel_S_LOOP:
.Lgemv_t_kernel_S_LOOP:


fmov TEMP, REG0 fmov TEMP, REG0
mov A_PTR, A mov A_PTR, A
@@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_t_kernel_S1
ble .Lgemv_t_kernel_S1


gemv_t_kernel_S4:
.Lgemv_t_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S4
bne .Lgemv_t_kernel_S4


gemv_t_kernel_S1:
.Lgemv_t_kernel_S1:


ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_S_END
ble .Lgemv_t_kernel_S_END


gemv_t_kernel_S10:
.Lgemv_t_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S10
bne .Lgemv_t_kernel_S10


gemv_t_kernel_S_END:
.Lgemv_t_kernel_S_END:


ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
bne .Lgemv_t_kernel_S_LOOP


gemv_t_kernel_L999:
.Lgemv_t_kernel_L999:


RESTORE_REGS RESTORE_REGS




+ 24
- 24
kernel/arm64/iamax.S View File

@@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Liamax_kernel_S_BEGIN
mov x7, X mov x7, X


iamax_kernel_F_BEGIN:
.Liamax_kernel_F_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq iamax_kernel_F1
beq .Liamax_kernel_F1


add Z, Z, #1 add Z, Z, #1
iamax_kernel_F8:
.Liamax_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8
bne .Liamax_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1:
.Liamax_kernel_F1:


ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999
ble .Liamax_kernel_L999


iamax_kernel_F10:
.Liamax_kernel_F10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10
bne .Liamax_kernel_F10


b iamax_kernel_L999
b .Liamax_kernel_L999


iamax_kernel_S_BEGIN:
.Liamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1
ble .Liamax_kernel_S1


iamax_kernel_S4:
.Liamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4
bne .Liamax_kernel_S4


iamax_kernel_S1:
.Liamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999
ble .Liamax_kernel_L999


iamax_kernel_S10:
.Liamax_kernel_S10:


KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10
bne .Liamax_kernel_S10


iamax_kernel_L999:
.Liamax_kernel_L999:


mov x0, INDEX mov x0, INDEX
ret ret


iamax_kernel_zero:
.Liamax_kernel_zero:


mov x0, xzr mov x0, xzr
ret ret


+ 24
- 24
kernel/arm64/izamax.S View File

@@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Lizamax_kernel_S_BEGIN
mov x7, X mov x7, X




iamax_kernel_F_BEGIN:
.Lizamax_kernel_F_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
ble iamax_kernel_F1
ble .Lizamax_kernel_F1


add Z, Z, #1 add Z, Z, #1


iamax_kernel_F8:
.Lizamax_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8
bne .Lizamax_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1:
.Lizamax_kernel_F1:


ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


iamax_kernel_F10:
.Lizamax_kernel_F10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10
bne .Lizamax_kernel_F10


b iamax_kernel_L999
b .Lizamax_kernel_L999


iamax_kernel_S_BEGIN:
.Lizamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1
ble .Lizamax_kernel_S1


iamax_kernel_S4:
.Lizamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4
bne .Lizamax_kernel_S4


iamax_kernel_S1:
.Lizamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


iamax_kernel_S10:
.Lizamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10
bne .Lizamax_kernel_S10


iamax_kernel_L999:
.Lizamax_kernel_L999:


mov x0, INDEX mov x0, INDEX
ret ret


iamax_kernel_zero:
.Lizamax_kernel_zero:


mov x0, xzr mov x0, xzr
ret ret


+ 16
- 16
kernel/arm64/nrm2.S View File

@@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT INIT


cmp N, #0 cmp N, #0
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999


cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lnrm2_kernel_L999




cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lnrm2_kernel_S_BEGIN


nrm2_kernel_F_BEGIN:
.Lnrm2_kernel_F_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1
ble .Lnrm2_kernel_F1


nrm2_kernel_F8:
.Lnrm2_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8
bne .Lnrm2_kernel_F8


nrm2_kernel_F1:
.Lnrm2_kernel_F1:


ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999




nrm2_kernel_F10:
.Lnrm2_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10
bne .Lnrm2_kernel_F10


b nrm2_kernel_L999
b .Lnrm2_kernel_L999


nrm2_kernel_S_BEGIN:
.Lnrm2_kernel_S_BEGIN:


INIT_S INIT_S


@@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:


.align 5 .align 5


nrm2_kernel_S10:
.Lnrm2_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10
bne .Lnrm2_kernel_S10




nrm2_kernel_L999:
.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ




+ 20
- 20
kernel/arm64/rot.S View File

@@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble rot_kernel_L999
ble .Lrot_kernel_L999


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN


rot_kernel_F_BEGIN:
.Lrot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1
beq .Lrot_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


rot_kernel_F4:
.Lrot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne rot_kernel_F4
bne .Lrot_kernel_F4


rot_kernel_F1:
.Lrot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999


INIT_F1 INIT_F1


rot_kernel_F10:
.Lrot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne rot_kernel_F10
bne .Lrot_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


rot_kernel_S_BEGIN:
.Lrot_kernel_S_BEGIN:


INIT_S INIT_S
INIT_F1 INIT_F1
@@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1
ble .Lrot_kernel_S1


rot_kernel_S4:
.Lrot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S4
bne .Lrot_kernel_S4


rot_kernel_S1:
.Lrot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999




rot_kernel_S10:
.Lrot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S10
bne .Lrot_kernel_S10


rot_kernel_L999:
.Lrot_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 23
- 23
kernel/arm64/scal.S View File

@@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble scal_kernel_L999
ble .Lscal_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq scal_kernel_zero
beq .Lscal_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne scal_kernel_S_BEGIN
bne .Lscal_kernel_S_BEGIN


scal_kernel_F_BEGIN:
.Lscal_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq scal_kernel_F1
beq .Lscal_kernel_F1


KERNEL_INIT_F8 KERNEL_INIT_F8


scal_kernel_F8:
.Lscal_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne scal_kernel_F8
bne .Lscal_kernel_F8


scal_kernel_F1:
.Lscal_kernel_F1:


ands I, N, #7 ands I, N, #7
ble scal_kernel_L999
ble .Lscal_kernel_L999


scal_kernel_F10:
.Lscal_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne scal_kernel_F10
bne .Lscal_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


scal_kernel_S_BEGIN:
.Lscal_kernel_S_BEGIN:


INIT_S INIT_S
mov X_COPY, X mov X_COPY, X


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble scal_kernel_S1
ble .Lscal_kernel_S1


scal_kernel_S4:
.Lscal_kernel_S4:


KERNEL_S4 KERNEL_S4


subs I, I, #1 subs I, I, #1
bne scal_kernel_S4
bne .Lscal_kernel_S4


scal_kernel_S1:
.Lscal_kernel_S1:


ands I, N, #3 ands I, N, #3
ble scal_kernel_L999
ble .Lscal_kernel_L999


scal_kernel_S10:
.Lscal_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne scal_kernel_S10
bne .Lscal_kernel_S10


scal_kernel_L999:
.Lscal_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


scal_kernel_zero:
.Lscal_kernel_zero:


INIT_S INIT_S


scal_kernel_Z1:
.Lscal_kernel_Z1:


st1 DAV, [X], INC_X st1 DAV, [X], INC_X
subs N, N, #1 subs N, N, #1
bne scal_kernel_Z1
bne .Lscal_kernel_Z1


mov w0, wzr mov w0, wzr
ret ret


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
File diff suppressed because it is too large
View File


+ 155
- 155
kernel/arm64/sgemm_kernel_4x4.S View File

@@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN
ble .Lsgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L4_BEGIN:
.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1 add pA_2, temp, pA_1
add pA_3, temp, pA_2 add pA_3, temp, pA_2


sgemm_kernel_L4_M16_BEGIN:
.Lsgemm_kernel_L4_M16_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16 asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
ble .Lsgemm_kernel_L4_M8_BEGIN


sgemm_kernel_L4_M16_20:
.Lsgemm_kernel_L4_M16_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32
blt .Lsgemm_kernel_L4_M16_32


KERNEL16x4_I // do one in the K KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K KERNEL16x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
ble .Lsgemm_kernel_L4_M16_22a
.align 5 .align 5


sgemm_kernel_L4_M16_22:
.Lsgemm_kernel_L4_M16_22:


KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_M2 KERNEL16x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
bgt .Lsgemm_kernel_L4_M16_22




sgemm_kernel_L4_M16_22a:
.Lsgemm_kernel_L4_M16_22a:


KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_E KERNEL16x4_E


b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44


sgemm_kernel_L4_M16_32:
.Lsgemm_kernel_L4_M16_32:


tst counterL, #1 tst counterL, #1
ble sgemm_kernel_L4_M16_40
ble .Lsgemm_kernel_L4_M16_40


KERNEL16x4_I KERNEL16x4_I


KERNEL16x4_E KERNEL16x4_E


b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44




sgemm_kernel_L4_M16_40:
.Lsgemm_kernel_L4_M16_40:


INIT16x4 INIT16x4


sgemm_kernel_L4_M16_44:
.Lsgemm_kernel_L4_M16_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100
ble .Lsgemm_kernel_L4_M16_100


sgemm_kernel_L4_M16_46:
.Lsgemm_kernel_L4_M16_46:


KERNEL16x4_SUB KERNEL16x4_SUB


sgemm_kernel_L4_M16_100:
.Lsgemm_kernel_L4_M16_100:


SAVE16x4 SAVE16x4


sgemm_kernel_L4_M16_END:
.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp add pA_0, pA_0, temp
add pA_0, pA_0, temp add pA_0, pA_0, temp
@@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp add pA_2, pA_1, temp
add pA_3, pA_2, temp add pA_3, pA_2, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20
bne .Lsgemm_kernel_L4_M16_20


sgemm_kernel_L4_M8_BEGIN:
.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #15 tst counterI , #15
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #8 tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN
ble .Lsgemm_kernel_L4_M4_BEGIN


sgemm_kernel_L4_M8_20:
.Lsgemm_kernel_L4_M8_20:


INIT8x4 INIT8x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M8_40
ble .Lsgemm_kernel_L4_M8_40


sgemm_kernel_L4_M8_22:
.Lsgemm_kernel_L4_M8_22:


KERNEL8x4_SUB KERNEL8x4_SUB
KERNEL8x4_SUB KERNEL8x4_SUB
@@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22
bgt .Lsgemm_kernel_L4_M8_22




sgemm_kernel_L4_M8_40:
.Lsgemm_kernel_L4_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100
ble .Lsgemm_kernel_L4_M8_100


sgemm_kernel_L4_M8_42:
.Lsgemm_kernel_L4_M8_42:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42
bgt .Lsgemm_kernel_L4_M8_42


sgemm_kernel_L4_M8_100:
.Lsgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


sgemm_kernel_L4_M8_END:
.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4 lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp add pA_0, pA_0, temp


sgemm_kernel_L4_M4_BEGIN:
.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN
ble .Lsgemm_kernel_L4_M2_BEGIN


sgemm_kernel_L4_M4_20:
.Lsgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M4_40
ble .Lsgemm_kernel_L4_M4_40


sgemm_kernel_L4_M4_22:
.Lsgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22
bgt .Lsgemm_kernel_L4_M4_22




sgemm_kernel_L4_M4_40:
.Lsgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100
ble .Lsgemm_kernel_L4_M4_100


sgemm_kernel_L4_M4_42:
.Lsgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42
bgt .Lsgemm_kernel_L4_M4_42


sgemm_kernel_L4_M4_100:
.Lsgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


sgemm_kernel_L4_M4_END:
.Lsgemm_kernel_L4_M4_END:




sgemm_kernel_L4_M2_BEGIN:
.Lsgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L4_M1_BEGIN
ble .Lsgemm_kernel_L4_M1_BEGIN


sgemm_kernel_L4_M2_20:
.Lsgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M2_40
ble .Lsgemm_kernel_L4_M2_40


sgemm_kernel_L4_M2_22:
.Lsgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22
bgt .Lsgemm_kernel_L4_M2_22




sgemm_kernel_L4_M2_40:
.Lsgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100
ble .Lsgemm_kernel_L4_M2_100


sgemm_kernel_L4_M2_42:
.Lsgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42
bgt .Lsgemm_kernel_L4_M2_42


sgemm_kernel_L4_M2_100:
.Lsgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


sgemm_kernel_L4_M2_END:
.Lsgemm_kernel_L4_M2_END:




sgemm_kernel_L4_M1_BEGIN:
.Lsgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


sgemm_kernel_L4_M1_20:
.Lsgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M1_40
ble .Lsgemm_kernel_L4_M1_40


sgemm_kernel_L4_M1_22:
.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22
bgt .Lsgemm_kernel_L4_M1_22




sgemm_kernel_L4_M1_40:
.Lsgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100
ble .Lsgemm_kernel_L4_M1_100


sgemm_kernel_L4_M1_42:
.Lsgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42
bgt .Lsgemm_kernel_L4_M1_42


sgemm_kernel_L4_M1_100:
.Lsgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




sgemm_kernel_L4_END:
.Lsgemm_kernel_L4_END:


lsl temp, origK, #4 lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4 add origPB, origPB, temp // B = B + K * 4 * 4


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN
bgt .Lsgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble sgemm_kernel_L999
ble .Lsgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble sgemm_kernel_L1_BEGIN
ble .Lsgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction






sgemm_kernel_L2_M4_BEGIN:
.Lsgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN
ble .Lsgemm_kernel_L2_M2_BEGIN


sgemm_kernel_L2_M4_20:
.Lsgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M4_40
ble .Lsgemm_kernel_L2_M4_40
.align 5 .align 5


sgemm_kernel_L2_M4_22:
.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22
bgt .Lsgemm_kernel_L2_M4_22




sgemm_kernel_L2_M4_40:
.Lsgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100
ble .Lsgemm_kernel_L2_M4_100


sgemm_kernel_L2_M4_42:
.Lsgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42
bgt .Lsgemm_kernel_L2_M4_42


sgemm_kernel_L2_M4_100:
.Lsgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


sgemm_kernel_L2_M4_END:
.Lsgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20
bgt .Lsgemm_kernel_L2_M4_20




sgemm_kernel_L2_M2_BEGIN:
.Lsgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L2_M1_BEGIN
ble .Lsgemm_kernel_L2_M1_BEGIN


sgemm_kernel_L2_M2_20:
.Lsgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M2_40
ble .Lsgemm_kernel_L2_M2_40


sgemm_kernel_L2_M2_22:
.Lsgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22
bgt .Lsgemm_kernel_L2_M2_22




sgemm_kernel_L2_M2_40:
.Lsgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100
ble .Lsgemm_kernel_L2_M2_100


sgemm_kernel_L2_M2_42:
.Lsgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42
bgt .Lsgemm_kernel_L2_M2_42


sgemm_kernel_L2_M2_100:
.Lsgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


sgemm_kernel_L2_M2_END:
.Lsgemm_kernel_L2_M2_END:




sgemm_kernel_L2_M1_BEGIN:
.Lsgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END


sgemm_kernel_L2_M1_20:
.Lsgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L2_M1_40
ble .Lsgemm_kernel_L2_M1_40


sgemm_kernel_L2_M1_22:
.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22
bgt .Lsgemm_kernel_L2_M1_22




sgemm_kernel_L2_M1_40:
.Lsgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100
ble .Lsgemm_kernel_L2_M1_100


sgemm_kernel_L2_M1_42:
.Lsgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42
bgt .Lsgemm_kernel_L2_M1_42


sgemm_kernel_L2_M1_100:
.Lsgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




sgemm_kernel_L2_END:
.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4


/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L1_BEGIN:
.Lsgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble sgemm_kernel_L999 // done
ble .Lsgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:






sgemm_kernel_L1_M4_BEGIN:
.Lsgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN
ble .Lsgemm_kernel_L1_M2_BEGIN


sgemm_kernel_L1_M4_20:
.Lsgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M4_40
ble .Lsgemm_kernel_L1_M4_40
.align 5 .align 5


sgemm_kernel_L1_M4_22:
.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22
bgt .Lsgemm_kernel_L1_M4_22




sgemm_kernel_L1_M4_40:
.Lsgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100
ble .Lsgemm_kernel_L1_M4_100


sgemm_kernel_L1_M4_42:
.Lsgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42
bgt .Lsgemm_kernel_L1_M4_42


sgemm_kernel_L1_M4_100:
.Lsgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


sgemm_kernel_L1_M4_END:
.Lsgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20
bgt .Lsgemm_kernel_L1_M4_20




sgemm_kernel_L1_M2_BEGIN:
.Lsgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble sgemm_kernel_L1_M1_BEGIN
ble .Lsgemm_kernel_L1_M1_BEGIN


sgemm_kernel_L1_M2_20:
.Lsgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M2_40
ble .Lsgemm_kernel_L1_M2_40


sgemm_kernel_L1_M2_22:
.Lsgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22
bgt .Lsgemm_kernel_L1_M2_22




sgemm_kernel_L1_M2_40:
.Lsgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100
ble .Lsgemm_kernel_L1_M2_100


sgemm_kernel_L1_M2_42:
.Lsgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42
bgt .Lsgemm_kernel_L1_M2_42


sgemm_kernel_L1_M2_100:
.Lsgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


sgemm_kernel_L1_M2_END:
.Lsgemm_kernel_L1_M2_END:




sgemm_kernel_L1_M1_BEGIN:
.Lsgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END


sgemm_kernel_L1_M1_20:
.Lsgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M1_40
ble .Lsgemm_kernel_L1_M1_40


sgemm_kernel_L1_M1_22:
.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22
bgt .Lsgemm_kernel_L1_M1_22




sgemm_kernel_L1_M1_40:
.Lsgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100
ble .Lsgemm_kernel_L1_M1_100


sgemm_kernel_L1_M1_42:
.Lsgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42
bgt .Lsgemm_kernel_L1_M1_42


sgemm_kernel_L1_M1_100:
.Lsgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




sgemm_kernel_L1_END:
.Lsgemm_kernel_L1_END:




sgemm_kernel_L999:
.Lsgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/sgemm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/strmm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 130
- 130
kernel/arm64/strmm_kernel_4x4.S View File

@@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


PROLOGUE PROLOGUE


strmm_kernel_begin:
.Lstrmm_kernel_begin:


.align 5 .align 5
add sp, sp, #-(11 * 16) add sp, sp, #-(11 * 16)
@@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble strmm_kernel_L2_BEGIN
ble .Lstrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


strmm_kernel_L4_BEGIN:
.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


strmm_kernel_L4_M4_BEGIN:
.Lstrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN
ble .Lstrmm_kernel_L4_M2_BEGIN


strmm_kernel_L4_M4_20:
.Lstrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32
blt .Lstrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a
ble .Lstrmm_kernel_L4_M4_22a
.align 5 .align 5


strmm_kernel_L4_M4_22:
.Lstrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22
bgt .Lstrmm_kernel_L4_M4_22


strmm_kernel_L4_M4_22a:
.Lstrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44


strmm_kernel_L4_M4_32:
.Lstrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble strmm_kernel_L4_M4_40
ble .Lstrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E


b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44


strmm_kernel_L4_M4_40:
.Lstrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


strmm_kernel_L4_M4_44:
.Lstrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100
ble .Lstrmm_kernel_L4_M4_100


strmm_kernel_L4_M4_46:
.Lstrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


strmm_kernel_L4_M4_100:
.Lstrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L4_M4_END:
.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20
bne .Lstrmm_kernel_L4_M4_20


strmm_kernel_L4_M2_BEGIN:
.Lstrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L4_M1_BEGIN
ble .Lstrmm_kernel_L4_M1_BEGIN


strmm_kernel_L4_M2_20:
.Lstrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M2_40
ble .Lstrmm_kernel_L4_M2_40


strmm_kernel_L4_M2_22:
.Lstrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22
bgt .Lstrmm_kernel_L4_M2_22




strmm_kernel_L4_M2_40:
.Lstrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100
ble .Lstrmm_kernel_L4_M2_100


strmm_kernel_L4_M2_42:
.Lstrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42
bgt .Lstrmm_kernel_L4_M2_42


strmm_kernel_L4_M2_100:
.Lstrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif #endif




strmm_kernel_L4_M2_END:
.Lstrmm_kernel_L4_M2_END:




strmm_kernel_L4_M1_BEGIN:
.Lstrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END


strmm_kernel_L4_M1_20:
.Lstrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M1_40
ble .Lstrmm_kernel_L4_M1_40


strmm_kernel_L4_M1_22:
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22
bgt .Lstrmm_kernel_L4_M1_22




strmm_kernel_L4_M1_40:
.Lstrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100
ble .Lstrmm_kernel_L4_M1_100


strmm_kernel_L4_M1_42:
.Lstrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42
bgt .Lstrmm_kernel_L4_M1_42


strmm_kernel_L4_M1_100:
.Lstrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif #endif




strmm_kernel_L4_END:
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4


#if !defined(LEFT) #if !defined(LEFT)
@@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN
bgt .Lstrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


strmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble strmm_kernel_L999
ble .Lstrmm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble strmm_kernel_L1_BEGIN
ble .Lstrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


strmm_kernel_L2_M4_BEGIN:
.Lstrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN
ble .Lstrmm_kernel_L2_M2_BEGIN


strmm_kernel_L2_M4_20:
.Lstrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M4_40
ble .Lstrmm_kernel_L2_M4_40
.align 5 .align 5


strmm_kernel_L2_M4_22:
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22
bgt .Lstrmm_kernel_L2_M4_22




strmm_kernel_L2_M4_40:
.Lstrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100
ble .Lstrmm_kernel_L2_M4_100


strmm_kernel_L2_M4_42:
.Lstrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42
bgt .Lstrmm_kernel_L2_M4_42


strmm_kernel_L2_M4_100:
.Lstrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L2_M4_END:
.Lstrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20
bgt .Lstrmm_kernel_L2_M4_20




strmm_kernel_L2_M2_BEGIN:
.Lstrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L2_M1_BEGIN
ble .Lstrmm_kernel_L2_M1_BEGIN


strmm_kernel_L2_M2_20:
.Lstrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M2_40
ble .Lstrmm_kernel_L2_M2_40


strmm_kernel_L2_M2_22:
.Lstrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22
bgt .Lstrmm_kernel_L2_M2_22




strmm_kernel_L2_M2_40:
.Lstrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100
ble .Lstrmm_kernel_L2_M2_100


strmm_kernel_L2_M2_42:
.Lstrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42
bgt .Lstrmm_kernel_L2_M2_42


strmm_kernel_L2_M2_100:
.Lstrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


strmm_kernel_L2_M2_END:
.Lstrmm_kernel_L2_M2_END:




strmm_kernel_L2_M1_BEGIN:
.Lstrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END


strmm_kernel_L2_M1_20:
.Lstrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble strmm_kernel_L2_M1_40
ble .Lstrmm_kernel_L2_M1_40


strmm_kernel_L2_M1_22:
.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22
bgt .Lstrmm_kernel_L2_M1_22




strmm_kernel_L2_M1_40:
.Lstrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100
ble .Lstrmm_kernel_L2_M1_100


strmm_kernel_L2_M1_42:
.Lstrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42
bgt .Lstrmm_kernel_L2_M1_42


strmm_kernel_L2_M1_100:
.Lstrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


strmm_kernel_L2_END:
.Lstrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


strmm_kernel_L1_BEGIN:
.Lstrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble strmm_kernel_L999 // done
ble .Lstrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


strmm_kernel_L1_M4_BEGIN:
.Lstrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN
ble .Lstrmm_kernel_L1_M2_BEGIN


strmm_kernel_L1_M4_20:
.Lstrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M4_40
ble .Lstrmm_kernel_L1_M4_40
.align 5 .align 5


strmm_kernel_L1_M4_22:
.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22
bgt .Lstrmm_kernel_L1_M4_22




strmm_kernel_L1_M4_40:
.Lstrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100
ble .Lstrmm_kernel_L1_M4_100


strmm_kernel_L1_M4_42:
.Lstrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42
bgt .Lstrmm_kernel_L1_M4_42


strmm_kernel_L1_M4_100:
.Lstrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L1_M4_END:
.Lstrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20
bgt .Lstrmm_kernel_L1_M4_20




strmm_kernel_L1_M2_BEGIN:
.Lstrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L1_M1_BEGIN
ble .Lstrmm_kernel_L1_M1_BEGIN


strmm_kernel_L1_M2_20:
.Lstrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M2_40
ble .Lstrmm_kernel_L1_M2_40


strmm_kernel_L1_M2_22:
.Lstrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22
bgt .Lstrmm_kernel_L1_M2_22




strmm_kernel_L1_M2_40:
.Lstrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100
ble .Lstrmm_kernel_L1_M2_100


strmm_kernel_L1_M2_42:
.Lstrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42
bgt .Lstrmm_kernel_L1_M2_42


strmm_kernel_L1_M2_100:
.Lstrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif #endif




strmm_kernel_L1_M2_END:
.Lstrmm_kernel_L1_M2_END:




strmm_kernel_L1_M1_BEGIN:
.Lstrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END


strmm_kernel_L1_M1_20:
.Lstrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M1_40
ble .Lstrmm_kernel_L1_M1_40


strmm_kernel_L1_M1_22:
.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22
bgt .Lstrmm_kernel_L1_M1_22




strmm_kernel_L1_M1_40:
.Lstrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100
ble .Lstrmm_kernel_L1_M1_100


strmm_kernel_L1_M1_42:
.Lstrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42
bgt .Lstrmm_kernel_L1_M1_42


strmm_kernel_L1_M1_100:
.Lstrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1


@@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif #endif
#endif #endif


strmm_kernel_L1_END:
.Lstrmm_kernel_L1_END:


#if 0 #if 0
#if !defined(LEFT) #if !defined(LEFT)
@@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif #endif
#endif #endif


strmm_kernel_L999:
.Lstrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/strmm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 21
- 21
kernel/arm64/swap.S View File

@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble swap_kernel_L999
ble .Lswap_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN


swap_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq swap_kernel_F1
beq .Lswap_kernel_F1


swap_kernel_F8:
.Lswap_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne swap_kernel_F8
bne .Lswap_kernel_F8


swap_kernel_F1:
.Lswap_kernel_F1:


ands I, N, #7 ands I, N, #7
ble swap_kernel_L999
ble .Lswap_kernel_L999


swap_kernel_F10:
.Lswap_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne swap_kernel_F10
bne .Lswap_kernel_F10


b swap_kernel_L999
b .Lswap_kernel_L999




swap_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble swap_kernel_S1
ble .Lswap_kernel_S1


swap_kernel_S4:
.Lswap_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne swap_kernel_S4
bne .Lswap_kernel_S4


swap_kernel_S1:
.Lswap_kernel_S1:


ands I, N, #3 ands I, N, #3
ble swap_kernel_L999
ble .Lswap_kernel_L999


swap_kernel_S10:
.Lswap_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne swap_kernel_S10
bne .Lswap_kernel_S10


swap_kernel_L999:
.Lswap_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


+ 25
- 25
kernel/arm64/zamax.S View File

@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lzamax_kernel_S_BEGIN


amax_kernel_F_BEGIN:
.Lzamax_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lzamax_kernel_F1_INIT


INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1
beq .Lzamax_kernel_F1


amax_kernel_F4:
.Lzamax_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne amax_kernel_F4
bne .Lzamax_kernel_F4


amax_kernel_F1:
.Lzamax_kernel_F1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999


amax_kernel_F10:
.Lzamax_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne amax_kernel_F10
bne .Lzamax_kernel_F10


ret ret


amax_kernel_F1_INIT:
.Lzamax_kernel_F1_INIT:


INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1
b .Lzamax_kernel_F1


amax_kernel_S_BEGIN:
.Lzamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble amax_kernel_L999
ble .Lzamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1
ble .Lzamax_kernel_S1


amax_kernel_S4:
.Lzamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S4
bne .Lzamax_kernel_S4


amax_kernel_S1:
.Lzamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999


amax_kernel_S10:
.Lzamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S10
bne .Lzamax_kernel_S10


amax_kernel_L999:
.Lzamax_kernel_L999:


ret ret


amax_kernel_zero:
.Lzamax_kernel_zero:


fmov MAXF, REG0 fmov MAXF, REG0
ret ret


+ 20
- 20
kernel/arm64/zasum.S View File

@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0 fmov SUMF, REG0


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lzasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lzasum_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lzasum_kernel_F1


asum_kernel_F4:
.Lzasum_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne asum_kernel_F4
bne .Lzasum_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


asum_kernel_F1:
.Lzasum_kernel_F1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999


asum_kernel_F10:
.Lzasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lzasum_kernel_F10


asum_kernel_L999:
.Lzasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lzasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lzasum_kernel_S1


asum_kernel_S4:
.Lzasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lzasum_kernel_S4


asum_kernel_S1:
.Lzasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999


asum_kernel_S10:
.Lzasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lzasum_kernel_S10


ret ret




+ 21
- 21
kernel/arm64/zaxpy.S View File

@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


mov Y_COPY, Y mov Y_COPY, Y


fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne .L1 bne .L1
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zaxpy_kernel_L999
beq .Lzaxpy_kernel_L999


.L1: .L1:
INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN


zaxpy_kernel_F_BEGIN:
.Lzaxpy_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zaxpy_kernel_F1
beq .Lzaxpy_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


zaxpy_kernel_F4:
.Lzaxpy_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F4
bne .Lzaxpy_kernel_F4


zaxpy_kernel_F1:
.Lzaxpy_kernel_F1:


ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


zaxpy_kernel_F10:
.Lzaxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F10
bne .Lzaxpy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


zaxpy_kernel_S_BEGIN:
.Lzaxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zaxpy_kernel_S1
ble .Lzaxpy_kernel_S1


zaxpy_kernel_S4:
.Lzaxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S4
bne .Lzaxpy_kernel_S4


zaxpy_kernel_S1:
.Lzaxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


zaxpy_kernel_S10:
.Lzaxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S10
bne .Lzaxpy_kernel_S10


zaxpy_kernel_L999:
.Lzaxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 20
- 20
kernel/arm64/zdot.S View File

@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble dot_kernel_L999
ble .Lzdot_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN


dot_kernel_F_BEGIN:
.Lzdot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1
beq .Lzdot_kernel_F1


dot_kernel_F4:
.Lzdot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne dot_kernel_F4
bne .Lzdot_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


dot_kernel_F1:
.Lzdot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999


dot_kernel_F10:
.Lzdot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne dot_kernel_F10
bne .Lzdot_kernel_F10


ret ret


dot_kernel_S_BEGIN:
.Lzdot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1
ble .Lzdot_kernel_S1


dot_kernel_S4:
.Lzdot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S4
bne .Lzdot_kernel_S4


dot_kernel_S1:
.Lzdot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999


dot_kernel_S10:
.Lzdot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S10
bne .Lzdot_kernel_S10


dot_kernel_L999:
.Lzdot_kernel_L999:


ret ret




+ 130
- 130
kernel/arm64/zgemm_kernel_4x4.S View File

@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN


zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN


.align 5 .align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a


.align 5 .align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22


.align 5 .align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


.align 5 .align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44




zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100


.align 5 .align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46


zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE4x4 SAVE4x4


zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20


zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN


zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40


zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22




zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100


zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42


zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:




zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40


zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22




zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100


zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42


zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction






zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN


zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5


zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22




zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100


zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42


zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20




zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN


zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40


zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22




zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100


zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42


zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:




zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40


zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22




zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100


zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42


zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2


/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:






zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN


zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5


zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22




zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100


zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42


zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20




zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN


zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40


zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22




zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100


zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42


zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:




zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40


zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22




zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100


zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42


zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:




zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 130
- 130
kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S View File

@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN


zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN


.align 5 .align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a


.align 5 .align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22


.align 5 .align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


.align 5 .align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44




zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100


.align 5 .align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46


zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE4x4 SAVE4x4


zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20


zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN


zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40


zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22




zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100


zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42


zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:




zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40


zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22




zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100


zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42


zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction






zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN


zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5


zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22




zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100


zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42


zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20




zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN


zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40


zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22




zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100


zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42


zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:




zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40


zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22




zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100


zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42


zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2


/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:






zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN


zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5


zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22




zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100


zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42


zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20




zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN


zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40


zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22




zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100


zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42


zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:




zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40


zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22




zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100


zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42


zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:




zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 26
- 26
kernel/arm64/zgemv_n.S View File

@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT


cmp INC_Y, #1 cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
bne .Lzgemv_n_kernel_S_BEGIN


zgemv_n_kernel_F_LOOP:
.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_n_kernel_F1
beq .Lzgemv_n_kernel_F1


zgemv_n_kernel_F4:
.Lzgemv_n_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F4
bne .Lzgemv_n_kernel_F4


zgemv_n_kernel_F1:
.Lzgemv_n_kernel_F1:


ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_F_END
ble .Lzgemv_n_kernel_F_END


zgemv_n_kernel_F10:
.Lzgemv_n_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F10
bne .Lzgemv_n_kernel_F10


zgemv_n_kernel_F_END:
.Lzgemv_n_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_F_LOOP
bne .Lzgemv_n_kernel_F_LOOP


b zgemv_n_kernel_L999
b .Lzgemv_n_kernel_L999


zgemv_n_kernel_S_BEGIN:
.Lzgemv_n_kernel_S_BEGIN:


INIT_S INIT_S


zgemv_n_kernel_S_LOOP:
.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_n_kernel_S1
ble .Lzgemv_n_kernel_S1


zgemv_n_kernel_S4:
.Lzgemv_n_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S4
bne .Lzgemv_n_kernel_S4


zgemv_n_kernel_S1:
.Lzgemv_n_kernel_S1:


ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_S_END
ble .Lzgemv_n_kernel_S_END


zgemv_n_kernel_S10:
.Lzgemv_n_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S10
bne .Lzgemv_n_kernel_S10


zgemv_n_kernel_S_END:
.Lzgemv_n_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_S_LOOP
bne .Lzgemv_n_kernel_S_LOOP


zgemv_n_kernel_L999:
.Lzgemv_n_kernel_L999:
RESTORE_REGS RESTORE_REGS


mov w0, wzr mov w0, wzr


+ 26
- 26
kernel/arm64/zgemv_t.S View File

@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
@@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
bne .Lzgemv_t_kernel_S_BEGIN


zgemv_t_kernel_F_LOOP:
.Lzgemv_t_kernel_F_LOOP:


mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_t_kernel_F1
beq .Lzgemv_t_kernel_F1


zgemv_t_kernel_F4:
.Lzgemv_t_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F4
bne .Lzgemv_t_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


zgemv_t_kernel_F1:
.Lzgemv_t_kernel_F1:


ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_F_END
ble .Lzgemv_t_kernel_F_END


zgemv_t_kernel_F10:
.Lzgemv_t_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F10
bne .Lzgemv_t_kernel_F10


zgemv_t_kernel_F_END:
.Lzgemv_t_kernel_F_END:


#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_F_LOOP
bne .Lzgemv_t_kernel_F_LOOP


b zgemv_t_kernel_L999
b .Lzgemv_t_kernel_L999


zgemv_t_kernel_S_BEGIN:
.Lzgemv_t_kernel_S_BEGIN:


INIT_S INIT_S


zgemv_t_kernel_S_LOOP:
.Lzgemv_t_kernel_S_LOOP:


mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_t_kernel_S1
ble .Lzgemv_t_kernel_S1


zgemv_t_kernel_S4:
.Lzgemv_t_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S4
bne .Lzgemv_t_kernel_S4


zgemv_t_kernel_S1:
.Lzgemv_t_kernel_S1:


ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_S_END
ble .Lzgemv_t_kernel_S_END


zgemv_t_kernel_S10:
.Lzgemv_t_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S10
bne .Lzgemv_t_kernel_S10


zgemv_t_kernel_S_END:
.Lzgemv_t_kernel_S_END:


#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_S_LOOP
bne .Lzgemv_t_kernel_S_LOOP


zgemv_t_kernel_L999:
.Lzgemv_t_kernel_L999:
RESTORE_REGS RESTORE_REGS
mov w0, wzr mov w0, wzr
ret ret


+ 16
- 16
kernel/arm64/znrm2.S View File

@@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
INIT INIT


cmp N, #0 cmp N, #0
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999


cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lznrm2_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lznrm2_kernel_S_BEGIN


nrm2_kernel_F_BEGIN:
.Lznrm2_kernel_F_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1
ble .Lznrm2_kernel_F1


nrm2_kernel_F8:
.Lznrm2_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8
bne .Lznrm2_kernel_F8


nrm2_kernel_F1:
.Lznrm2_kernel_F1:


ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999




nrm2_kernel_F10:
.Lznrm2_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10
bne .Lznrm2_kernel_F10


b nrm2_kernel_L999
b .Lznrm2_kernel_L999


nrm2_kernel_S_BEGIN:
.Lznrm2_kernel_S_BEGIN:


INIT_S INIT_S


@@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:


.align 5 .align 5


nrm2_kernel_S10:
.Lznrm2_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10
bne .Lznrm2_kernel_S10




nrm2_kernel_L999:
.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ




+ 20
- 20
kernel/arm64/zrot.S View File

@@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble rot_kernel_L999
ble .Lzrot_kernel_L999


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN


rot_kernel_F_BEGIN:
.Lzrot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1
beq .Lzrot_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


rot_kernel_F4:
.Lzrot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne rot_kernel_F4
bne .Lzrot_kernel_F4


rot_kernel_F1:
.Lzrot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999


rot_kernel_F10:
.Lzrot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne rot_kernel_F10
bne .Lzrot_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


rot_kernel_S_BEGIN:
.Lzrot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1
ble .Lzrot_kernel_S1


rot_kernel_S4:
.Lzrot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S4
bne .Lzrot_kernel_S4


rot_kernel_S1:
.Lzrot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999


rot_kernel_S10:
.Lzrot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S10
bne .Lzrot_kernel_S10


rot_kernel_L999:
.Lzrot_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 34
- 34
kernel/arm64/zscal.S View File

@@ -215,71 +215,71 @@ zscal_begin:
mov X_COPY, X mov X_COPY, X


cmp N, xzr cmp N, xzr
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero
bne .Lzscal_kernel_R_non_zero


fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_RI_zero
beq .Lzscal_kernel_RI_zero


b zscal_kernel_R_zero
b .Lzscal_kernel_R_zero


zscal_kernel_R_non_zero:
.Lzscal_kernel_R_non_zero:


fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_I_zero
beq .Lzscal_kernel_I_zero


/******************************************************************************* /*******************************************************************************
* A_R != 0 && A_I != 0 * A_R != 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_RI_non_zero:
.Lzscal_kernel_RI_non_zero:


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zscal_kernel_S_BEGIN
bne .Lzscal_kernel_S_BEGIN


zscal_kernel_F_BEGIN:
.Lzscal_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zscal_kernel_F1
beq .Lzscal_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


zscal_kernel_F4:
.Lzscal_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zscal_kernel_F4
bne .Lzscal_kernel_F4


zscal_kernel_F1:
.Lzscal_kernel_F1:


ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


zscal_kernel_F10:
.Lzscal_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_F10
bne .Lzscal_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


zscal_kernel_S_BEGIN:
.Lzscal_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zscal_kernel_S1
ble .Lzscal_kernel_S1


zscal_kernel_S4:
.Lzscal_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_S4
bne .Lzscal_kernel_S4


zscal_kernel_S1:
.Lzscal_kernel_S1:


ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


zscal_kernel_S10:
.Lzscal_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_S10
bne .Lzscal_kernel_S10


zscal_kernel_L999:
.Lzscal_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret
@@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0 * A_R == 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_R_zero:
.Lzscal_kernel_R_zero:
INIT_S INIT_S


#if !defined(DOUBLE) #if !defined(DOUBLE)
@@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif #endif


zscal_kernel_R_zero_1:
.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_R_zero_1
bne .Lzscal_kernel_R_zero_1


mov w0, wzr mov w0, wzr
ret ret
@@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0 * A_R != 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_I_zero:
.Lzscal_kernel_I_zero:
INIT_S INIT_S
#if !defined(DOUBLE) #if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif #endif


zscal_kernel_I_zero_1:
.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_I_zero_1
bne .Lzscal_kernel_I_zero_1


mov w0, wzr mov w0, wzr
ret ret
@@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0 * A_R == 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_RI_zero:
.Lzscal_kernel_RI_zero:


INIT_S INIT_S


zscal_kernel_RI_zero_1:
.Lzscal_kernel_RI_zero_1:


stp DA_R, DA_I, [X] stp DA_R, DA_I, [X]
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_RI_zero_1
bne .Lzscal_kernel_RI_zero_1


mov w0, wzr mov w0, wzr
ret ret


+ 130
- 130
kernel/arm64/ztrmm_kernel_4x4.S View File

@@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN
ble .Lztrmm_kernel_L2_BEGIN


ztrmm_kernel_L4_BEGIN:
.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ztrmm_kernel_L4_M4_BEGIN:
.Lztrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
ble .Lztrmm_kernel_L4_M2_BEGIN


.align 5 .align 5
ztrmm_kernel_L4_M4_20:
.Lztrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:


asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
blt .Lztrmm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
ble .Lztrmm_kernel_L4_M4_22a


.align 5 .align 5
ztrmm_kernel_L4_M4_22:
.Lztrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
bgt .Lztrmm_kernel_L4_M4_22


.align 5 .align 5
ztrmm_kernel_L4_M4_22a:
.Lztrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44


.align 5 .align 5
ztrmm_kernel_L4_M4_32:
.Lztrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble ztrmm_kernel_L4_M4_40
ble .Lztrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44




ztrmm_kernel_L4_M4_40:
.Lztrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ztrmm_kernel_L4_M4_44:
.Lztrmm_kernel_L4_M4_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
ble .Lztrmm_kernel_L4_M4_100


.align 5 .align 5
ztrmm_kernel_L4_M4_46:
.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
bne .Lztrmm_kernel_L4_M4_46


ztrmm_kernel_L4_M4_100:
.Lztrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


ztrmm_kernel_L4_M4_END:
.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
bne .Lztrmm_kernel_L4_M4_20


ztrmm_kernel_L4_M2_BEGIN:
.Lztrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L4_M1_BEGIN
ble .Lztrmm_kernel_L4_M1_BEGIN


ztrmm_kernel_L4_M2_20:
.Lztrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M2_40
ble .Lztrmm_kernel_L4_M2_40


ztrmm_kernel_L4_M2_22:
.Lztrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22
bgt .Lztrmm_kernel_L4_M2_22




ztrmm_kernel_L4_M2_40:
.Lztrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100
ble .Lztrmm_kernel_L4_M2_100


ztrmm_kernel_L4_M2_42:
.Lztrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42
bgt .Lztrmm_kernel_L4_M2_42


ztrmm_kernel_L4_M2_100:
.Lztrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L4_M2_END:
.Lztrmm_kernel_L4_M2_END:




ztrmm_kernel_L4_M1_BEGIN:
.Lztrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END


ztrmm_kernel_L4_M1_20:
.Lztrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M1_40
ble .Lztrmm_kernel_L4_M1_40


ztrmm_kernel_L4_M1_22:
.Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22
bgt .Lztrmm_kernel_L4_M1_22




ztrmm_kernel_L4_M1_40:
.Lztrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M1_100
ble .Lztrmm_kernel_L4_M1_100


ztrmm_kernel_L4_M1_42:
.Lztrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42
bgt .Lztrmm_kernel_L4_M1_42


ztrmm_kernel_L4_M1_100:
.Lztrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif #endif




ztrmm_kernel_L4_END:
.Lztrmm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ztrmm_kernel_L4_BEGIN
bgt .Lztrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ztrmm_kernel_L999 // error, N was less than 4?
ble .Lztrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble ztrmm_kernel_L1_BEGIN
ble .Lztrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


ztrmm_kernel_L2_M4_BEGIN:
.Lztrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ztrmm_kernel_L2_M2_BEGIN
ble .Lztrmm_kernel_L2_M2_BEGIN


ztrmm_kernel_L2_M4_20:
.Lztrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M4_40
ble .Lztrmm_kernel_L2_M4_40
.align 5 .align 5


ztrmm_kernel_L2_M4_22:
.Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22
bgt .Lztrmm_kernel_L2_M4_22




ztrmm_kernel_L2_M4_40:
.Lztrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M4_100
ble .Lztrmm_kernel_L2_M4_100


ztrmm_kernel_L2_M4_42:
.Lztrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42
bgt .Lztrmm_kernel_L2_M4_42


ztrmm_kernel_L2_M4_100:
.Lztrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ztrmm_kernel_L2_M4_END:
.Lztrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20
bgt .Lztrmm_kernel_L2_M4_20




ztrmm_kernel_L2_M2_BEGIN:
.Lztrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L2_M1_BEGIN
ble .Lztrmm_kernel_L2_M1_BEGIN


ztrmm_kernel_L2_M2_20:
.Lztrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M2_40
ble .Lztrmm_kernel_L2_M2_40


ztrmm_kernel_L2_M2_22:
.Lztrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22
bgt .Lztrmm_kernel_L2_M2_22




ztrmm_kernel_L2_M2_40:
.Lztrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M2_100
ble .Lztrmm_kernel_L2_M2_100


ztrmm_kernel_L2_M2_42:
.Lztrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42
bgt .Lztrmm_kernel_L2_M2_42


ztrmm_kernel_L2_M2_100:
.Lztrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L2_M2_END:
.Lztrmm_kernel_L2_M2_END:




ztrmm_kernel_L2_M1_BEGIN:
.Lztrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END


ztrmm_kernel_L2_M1_20:
.Lztrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ztrmm_kernel_L2_M1_40
ble .Lztrmm_kernel_L2_M1_40


ztrmm_kernel_L2_M1_22:
.Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22
bgt .Lztrmm_kernel_L2_M1_22




ztrmm_kernel_L2_M1_40:
.Lztrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M1_100
ble .Lztrmm_kernel_L2_M1_100


ztrmm_kernel_L2_M1_42:
.Lztrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42
bgt .Lztrmm_kernel_L2_M1_42


ztrmm_kernel_L2_M1_100:
.Lztrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif #endif




ztrmm_kernel_L2_END:
.Lztrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ztrmm_kernel_L1_BEGIN:
.Lztrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ztrmm_kernel_L999 // done
ble .Lztrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:






ztrmm_kernel_L1_M4_BEGIN:
.Lztrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L1_M2_BEGIN
ble .Lztrmm_kernel_L1_M2_BEGIN


ztrmm_kernel_L1_M4_20:
.Lztrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M4_40
ble .Lztrmm_kernel_L1_M4_40
.align 5 .align 5


ztrmm_kernel_L1_M4_22:
.Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22
bgt .Lztrmm_kernel_L1_M4_22




ztrmm_kernel_L1_M4_40:
.Lztrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M4_100
ble .Lztrmm_kernel_L1_M4_100


ztrmm_kernel_L1_M4_42:
.Lztrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42
bgt .Lztrmm_kernel_L1_M4_42


ztrmm_kernel_L1_M4_100:
.Lztrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ztrmm_kernel_L1_M4_END:
.Lztrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20
bgt .Lztrmm_kernel_L1_M4_20




ztrmm_kernel_L1_M2_BEGIN:
.Lztrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L1_M1_BEGIN
ble .Lztrmm_kernel_L1_M1_BEGIN


ztrmm_kernel_L1_M2_20:
.Lztrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M2_40
ble .Lztrmm_kernel_L1_M2_40


ztrmm_kernel_L1_M2_22:
.Lztrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22
bgt .Lztrmm_kernel_L1_M2_22




ztrmm_kernel_L1_M2_40:
.Lztrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M2_100
ble .Lztrmm_kernel_L1_M2_100


ztrmm_kernel_L1_M2_42:
.Lztrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42
bgt .Lztrmm_kernel_L1_M2_42


ztrmm_kernel_L1_M2_100:
.Lztrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L1_M2_END:
.Lztrmm_kernel_L1_M2_END:




ztrmm_kernel_L1_M1_BEGIN:
.Lztrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END


ztrmm_kernel_L1_M1_20:
.Lztrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M1_40
ble .Lztrmm_kernel_L1_M1_40


ztrmm_kernel_L1_M1_22:
.Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22
bgt .Lztrmm_kernel_L1_M1_22




ztrmm_kernel_L1_M1_40:
.Lztrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M1_100
ble .Lztrmm_kernel_L1_M1_100


ztrmm_kernel_L1_M1_42:
.Lztrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42
bgt .Lztrmm_kernel_L1_M1_42


ztrmm_kernel_L1_M1_100:
.Lztrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ztrmm_kernel_L1_END:
.Lztrmm_kernel_L1_END:




ztrmm_kernel_L999:
.Lztrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


Loading…
Cancel
Save