Browse Source

Fix inline assembly constraints in Bulldozer TRSM kernels

Rework the operand indices to allow marking i, as and bs as both input and output (operand n1 is marked as well, for simplicity). For #2009
tags/v0.3.6^2
Martin Kroeker GitHub 7 years ago
parent
commit
f9bb76d29a
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 356 additions and 356 deletions
  1. +48
    -48
      kernel/x86_64/dtrsm_kernel_RT_bulldozer.c
  2. +126
    -126
      kernel/x86_64/strsm_kernel_LN_bulldozer.c
  3. +128
    -128
      kernel/x86_64/strsm_kernel_LT_bulldozer.c
  4. +27
    -27
      kernel/x86_64/strsm_kernel_RN_bulldozer.c
  5. +27
    -27
      kernel/x86_64/strsm_kernel_RT_bulldozer.c

+ 48
- 48
kernel/x86_64/dtrsm_kernel_RT_bulldozer.c View File

@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" prefetcht0 384(%3,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" prefetcht0 384(%7,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t" // i = 1

" vmovddup (%7), %%xmm1 \n\t" // read b
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm1 \n\t" // read b
" vmovddup 8(%3), %%xmm0 \n\t" // read bb

" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb

" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a

" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"

" \n\t" // i = 0
" subq $16 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 8
" subq $16 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 8

" vmovddup (%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm0 \n\t" // read bb

" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"

" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"

" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 126
- 126
kernel/x86_64/strsm_kernel_LN_bulldozer.c View File

@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t"

" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i]
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i]
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i]
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i]
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i]
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i]
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i]
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i]
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i]
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"

" subq $64 , %6 \n\t" // a -= m
" subq $8 , %7 \n\t" // b -= n
" subq $64 , %2 \n\t" // a -= m
" subq $8 , %3 \n\t" // b -= n

" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i]
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 128
- 128
kernel/x86_64/strsm_kernel_LT_bulldozer.c View File

@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t"

" vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i]
" vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i]
" vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i]
" vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i]
" vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i]
" vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i]
" vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 0(%6) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 0(%2) , %%xmm4 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t"
" vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t"
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i]
" vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i]
" vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i]
" vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i]
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i]
" vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i]
" vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i]
" vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i]
" vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 16(%6) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 16(%2) , %%xmm5 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t"
" vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t"
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i]
" vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i]
" vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i]
" vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i]
" vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i]
" vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i]
" vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i]
" vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i]
" vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 32(%6) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 32(%2) , %%xmm6 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t"
" vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t"
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i]
" vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i]
" vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i]
" vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i]
" vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i]
" vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i]
" vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i]
" vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i]
" vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vmovups 48(%6) , %%xmm7 \n\t" // read a[k]
" vmovups 48(%2) , %%xmm7 \n\t" // read a[k]
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t"
" vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t"

" addq $64 , %6 \n\t" // a -= m
" addq $8 , %7 \n\t" // b -= n
" addq $64 , %2 \n\t" // a -= m
" addq $8 , %3 \n\t" // b -= n

" vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i]
" vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i]
" vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0
" vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1
" vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa
" vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa
" vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa
" vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa
" vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa
" vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa
" vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa

" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 27
- 27
kernel/x86_64/strsm_kernel_RN_bulldozer.c View File

@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t" // i = 0

" vbroadcastss (%7), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%7), %%xmm1 \n\t" // read b
" vbroadcastss (%3), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%3), %%xmm1 \n\t" // read b

" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"

" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"

" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t"

" \n\t" // i = 1
" addq $8 , %7 \n\t" // b = b + 2
" addq $64 , %6 \n\t" // a = a + 16
" addq $8 , %3 \n\t" // b = b + 2
" addq $64 , %2 \n\t" // a = a + 16

" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb

" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb

" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a

" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 27
- 27
kernel/x86_64/strsm_kernel_RT_bulldozer.c View File

@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%3,%1,1), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vbroadcastss 4(%7,%1,1), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t" // i = 1

" vbroadcastss (%7), %%xmm1 \n\t" // read b
" vbroadcastss 4(%7), %%xmm0 \n\t" // read bb
" vbroadcastss (%3), %%xmm1 \n\t" // read b
" vbroadcastss 4(%3), %%xmm0 \n\t" // read bb

" vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb

" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a

" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"

" \n\t" // i = 0
" subq $8 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 16
" subq $8 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 16

" vbroadcastss (%7), %%xmm0 \n\t" // read bb
" vbroadcastss (%3), %%xmm0 \n\t" // read bb

" vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t"

" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"

" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (c), // 4
"r" (c1), // 5
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


Loading…
Cancel
Save