|
|
|
@@ -340,7 +340,7 @@ |
|
|
|
|
|
|
|
vmovsd %xmm0, ALPHA
|
|
|
|
|
|
|
|
salq $BASE_SHIFT, LDC
|
|
|
|
salq $BASE_SHIFT, LDC # LDC <<= BASE_SHIFT (equals LDC * 8 only when BASE_SHIFT is 3 -- TODO confirm)
|
|
|
|
|
|
|
|
#ifdef TRMMKERNEL
|
|
|
|
vmovsd %xmm12, OFFSET
|
|
|
|
@@ -350,7 +350,7 @@ |
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
movq N, J
|
|
|
|
sarq $2, J # j = (n >> 2)
|
|
|
|
sarq $2, J # j = n >> 2 (arithmetic shift; same as n / 4 for n >= 0)
|
|
|
|
jle .L40 # j <= 0 (flags from the sarq above): go straight to .L40
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
@@ -434,104 +434,6 @@ |
|
|
|
#define PR2 24
|
|
|
|
|
|
|
|
.L12:
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(BO,%rax,4)
|
|
|
|
KERNEL1(16 * 0)
|
|
|
|
KERNEL2(16 * 0)
|
|
|
|
KERNEL3(16 * 0)
|
|
|
|
KERNEL4(16 * 0)
|
|
|
|
KERNEL5(16 * 0)
|
|
|
|
KERNEL6(16 * 0)
|
|
|
|
KERNEL7(16 * 0)
|
|
|
|
KERNEL8(16 * 0)
|
|
|
|
NOBRANCH
|
|
|
|
je .L15
|
|
|
|
prefetcht0 PR1*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR2*SIZE(AO,%rax,4)
|
|
|
|
prefetcht0 PR1*SIZE(BO,%rax,4)
|
|
|
|
@@ -986,15 +888,15 @@ |
|
|
|
jg .L01
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
.L40:
|
|
|
|
testq $3, N
|
|
|
|
je .L999
|
|
|
|
.L40: # Remainder dispatch: reached when the j = n >> 2 loop is finished or skipped
|
|
|
|
testq $3, N # ZF = ((N & 3) == 0): tests the low two bits of N, it does not compare with 3
|
|
|
|
je .L999 # low two bits clear (N % 4 == 0): jump to .L999
|
|
|
|
|
|
|
|
testq $2, N
|
|
|
|
testq $2, N # tests bit 1 of N: set when N % 4 is 2 or 3 (not only 2)
|
|
|
|
je .L80 # bit 1 clear: skip ahead to .L80
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
.L41:
|
|
|
|
.L41: # N % 4 > 1
|
|
|
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
|
|
|
movq OFFSET, %rax
|
|
|
|
movq %rax, KK
|
|
|
|
@@ -1002,14 +904,14 @@ |
|
|
|
|
|
|
|
movq C, CO1 # coffset1 = c
|
|
|
|
leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
|
|
|
|
movq A, AO # aoffset = a
|
|
|
|
movq A, AO # aoffset = a
|
|
|
|
|
|
|
|
movq K, %rax
|
|
|
|
salq $BASE_SHIFT + 1, %rax
|
|
|
|
salq $BASE_SHIFT + 1, %rax # rax = k << (BASE_SHIFT + 1); this is k << 4 only when BASE_SHIFT is 3
|
|
|
|
leaq (B, %rax), BB # BB = B + rax
|
|
|
|
|
|
|
|
movq M, I
|
|
|
|
sarq $2, I # i = (m >> 2)
|
|
|
|
sarq $2, I # i = (m >> 2)
|
|
|
|
jle .L60
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
@@ -1063,12 +965,12 @@ |
|
|
|
je .L56
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
.L52:
|
|
|
|
.L52: # Loop; appears to belong to the .L41 path, entered when bit 1 of N is set (N % 4 is 2 or 3, not only 2)
|
|
|
|
vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8
|
|
|
|
vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9
|
|
|
|
vmovups -14 * SIZE(AO, %rax, 4),%xmm2
|
|
|
|
vmovups -14 * SIZE(AO, %rax, 4),%xmm2
|
|
|
|
vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12
|
|
|
|
vmovups -12 * SIZE(AO, %rax, 4), %xmm0
|
|
|
|
vmovups -12 * SIZE(AO, %rax, 4), %xmm0
|
|
|
|
vmovddup -14 * SIZE(BO, %rax, 2), %xmm1
|
|
|
|
vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13
|
|
|
|
vmovddup -13 * SIZE(BO, %rax, 2), %xmm5
|
|
|
|
@@ -1076,15 +978,15 @@ |
|
|
|
vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8
|
|
|
|
vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12
|
|
|
|
vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9
|
|
|
|
vmovups (AO, %rax, 4), %xmm0
|
|
|
|
vmovddup -8 * SIZE(BO, %rax, 2), %xmm1
|
|
|
|
vmovups (AO, %rax, 4), %xmm0
|
|
|
|
vmovddup -8 * SIZE(BO, %rax, 2), %xmm1
|
|
|
|
vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13
|
|
|
|
vmovddup -11 * SIZE(BO, %rax, 2), %xmm5
|
|
|
|
vmovups -6 * SIZE(AO, %rax, 4), %xmm2
|
|
|
|
vmovups -6 * SIZE(AO, %rax, 4), %xmm2
|
|
|
|
vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8
|
|
|
|
vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12
|
|
|
|
vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9
|
|
|
|
vmovups -4 * SIZE(AO, %rax, 4), %xmm4
|
|
|
|
vmovups -4 * SIZE(AO, %rax, 4), %xmm4
|
|
|
|
vmovddup -10 * SIZE(BO, %rax, 2), %xmm3
|
|
|
|
vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13
|
|
|
|
vmovddup -9 * SIZE(BO, %rax, 2), %xmm5
|
|
|
|
@@ -1093,7 +995,7 @@ |
|
|
|
vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12
|
|
|
|
vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9
|
|
|
|
vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13
|
|
|
|
vmovups 8 * SIZE(AO, %rax, 4), %xmm4
|
|
|
|
vmovups 8 * SIZE(AO, %rax, 4), %xmm4
|
|
|
|
vmovddup -4 * SIZE(BO, %rax, 2), %xmm3
|
|
|
|
vmovddup -7 * SIZE(BO, %rax, 2), %xmm5
|
|
|
|
vmovaps %xmm0, %xmm2
|
|
|
|
@@ -1455,8 +1357,8 @@ |
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
.L80:
|
|
|
|
testq $1, N
|
|
|
|
je .L999
|
|
|
|
testq $1, N # tests bit 0 of N: set when N is odd (N % 4 is 1 or 3)
|
|
|
|
je .L999 # bit 0 clear (N even): jump to .L999
|
|
|
|
ALIGN_4
|
|
|
|
|
|
|
|
.L81:
|
|
|
|
|