| @@ -277,7 +277,7 @@ LEAQ (, %rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 8), ptrba; | |||
| LEAQ (ptrbb, %rax, 4), ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| PREFETCH2 0*SIZE(prebb); | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| PREFETCH2 8*SIZE(prebb); | |||
| @@ -317,7 +317,7 @@ ALIGN_5; | |||
| .L2_bodyB:; | |||
| # Computing kernel | |||
| #### Unroll times 1 #### | |||
| //#### Unroll times 1 #### | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| SHUF_DY $0x03, yvec2, yvec2, yvec4; | |||
| @@ -345,7 +345,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec10, yvec6, yvec10; | |||
| ADD_DY yvec8, yvec7, yvec8; | |||
| #### Unroll times 2 #### | |||
| //#### Unroll times 2 #### | |||
| LD_DY 12*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| SHUF_DY $0x03, yvec2, yvec2, yvec4; | |||
| @@ -373,7 +373,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec10, yvec6, yvec10; | |||
| ADD_DY yvec8, yvec7, yvec8; | |||
| #### Unroll times 3 #### | |||
| //#### Unroll times 3 #### | |||
| LD_DY 20*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| SHUF_DY $0x03, yvec2, yvec2, yvec4; | |||
| @@ -402,7 +402,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec10, yvec6, yvec10; | |||
| ADD_DY yvec8, yvec7, yvec8; | |||
| #### Unroll times 4 #### | |||
| //#### Unroll times 4 #### | |||
| LD_DY 28*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| SHUF_DY $0x03, yvec2, yvec2, yvec4; | |||
| @@ -446,7 +446,7 @@ TEST $2, %rax; | |||
| JLE .L3_loopE; | |||
| ALIGN_5 | |||
| .L3_bodyB: | |||
| #### Unroll times 1 #### | |||
| //#### Unroll times 1 #### | |||
| PREFETCH0 64*SIZE(ptrba) | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| @@ -475,7 +475,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec10, yvec6, yvec10; | |||
| ADD_DY yvec8, yvec7, yvec8; | |||
| #### Unroll times 2 #### | |||
| //#### Unroll times 2 #### | |||
| PREFETCH0 72*SIZE(ptrba) | |||
| LD_DY 12*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| @@ -516,7 +516,7 @@ TEST $1, %rax; | |||
| JLE .L4_loopE; | |||
| ALIGN_5 | |||
| .L4_bodyB:; | |||
| #### Unroll times 1 #### | |||
| //#### Unroll times 1 #### | |||
| PREFETCH0 64*SIZE(ptrba) | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| @@ -544,9 +544,9 @@ ADD_DY yvec10, yvec6, yvec10; | |||
| ADD_DY yvec8, yvec7, yvec8; | |||
| .L4_loopE:; | |||
| #### Load Alpha #### | |||
| //#### Load Alpha #### | |||
| BROAD_DY MEMALPHA,yvec7; | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| MUL_DY yvec7,yvec15,yvec15; | |||
| MUL_DY yvec7,yvec14,yvec14; | |||
| MUL_DY yvec7,yvec13,yvec13; | |||
| @@ -555,7 +555,7 @@ MUL_DY yvec7,yvec11,yvec11; | |||
| MUL_DY yvec7,yvec10,yvec10; | |||
| MUL_DY yvec7,yvec9,yvec9; | |||
| MUL_DY yvec7,yvec8,yvec8; | |||
| #### Reverse the Results #### | |||
| //#### Reverse the Results #### | |||
| MOV_DY yvec15,yvec7; | |||
| REVS_DY $0x0a,yvec13,yvec15,yvec15; | |||
| REVS_DY $0x0a,yvec7,yvec13,yvec13; | |||
| @@ -568,13 +568,13 @@ REVS_DY $0x0a,yvec7,yvec9,yvec9; | |||
| MOV_DY yvec10,yvec7; | |||
| REVS_DY $0x0a,yvec8,yvec10,yvec10; | |||
| REVS_DY $0x0a,yvec7,yvec8,yvec8; | |||
| #### Testing alignment #### | |||
| //#### Testing alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L4_loopEx; # Unalign part write back | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec14,xvec6; | |||
| EXTRA_DY $1,yvec13,xvec5; | |||
| @@ -776,7 +776,7 @@ LEAQ (, %rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 4), ptrba; | |||
| LEAQ (ptrbb, %rax, 4), ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| XOR_DY yvec13, yvec13, yvec13; | |||
| LD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -805,7 +805,7 @@ ALIGN_5; | |||
| .L6_bodyB:; | |||
| # Computing kernel | |||
| #### Untoll time 1 #### | |||
| //#### Untoll time 1 #### | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -821,7 +821,7 @@ VPERMILP_DY $0x05, yvec2, yvec3; | |||
| MUL_DY yvec0, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| #### Untoll time 2 #### | |||
| //#### Untoll time 2 #### | |||
| LD_DY 8*SIZE(ptrba), yvec0; | |||
| MUL_DY yvec1, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -837,7 +837,7 @@ VPERMILP_DY $0x05, yvec2, yvec3; | |||
| MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| #### Untoll time 3 #### | |||
| //#### Untoll time 3 #### | |||
| LD_DY 12*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -855,7 +855,7 @@ ADDQ $16*SIZE, ptrbb; | |||
| MUL_DY yvec0, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| #### Untoll time 4 #### | |||
| //#### Untoll time 4 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| MUL_DY yvec1, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -883,7 +883,7 @@ TEST $2, %rax; | |||
| JLE .L7_loopE; | |||
| ALIGN_5 | |||
| .L7_bodyB:; | |||
| #### Untoll time 1 #### | |||
| //#### Untoll time 1 #### | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -901,7 +901,7 @@ ADDQ $8*SIZE, ptrbb; | |||
| MUL_DY yvec0, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| #### Untoll time 2 #### | |||
| //#### Untoll time 2 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| MUL_DY yvec1, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| @@ -927,7 +927,7 @@ TEST $1, %rax; | |||
| JLE .L8_loopE; | |||
| ALIGN_5 | |||
| .L8_bodyB:; | |||
| #### Untoll time 1 #### | |||
| //#### Untoll time 1 #### | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| ADD_DY yvec15, yvec6, yvec15; | |||
| SHUF_DY $0x03, yvec2, yvec2, yvec4; | |||
| @@ -943,27 +943,27 @@ MUL_DY yvec0, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| .L8_loopE:; | |||
| #### Load Alpha #### | |||
| //#### Load Alpha #### | |||
| BROAD_DY MEMALPHA, yvec7; | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| MUL_DY yvec7,yvec15,yvec15; | |||
| MUL_DY yvec7,yvec13,yvec13; | |||
| MUL_DY yvec7,yvec11,yvec11; | |||
| MUL_DY yvec7,yvec9,yvec9; | |||
| #### Reverse the Results #### | |||
| //#### Reverse the Results #### | |||
| MOV_DY yvec15, yvec7; | |||
| REVS_DY $0x0a,yvec13,yvec15,yvec15; | |||
| REVS_DY $0x0a,yvec7,yvec13,yvec13; | |||
| MOV_DY yvec11,yvec7; | |||
| REVS_DY $0x0a,yvec9,yvec11,yvec11; | |||
| REVS_DY $0x0a,yvec7,yvec9,yvec9; | |||
| #### Testing alignment #### | |||
| //#### Testing alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L8_loopEx; # Unalign part write back | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec13,xvec5; | |||
| EXTRA_DY $1,yvec11,xvec3; | |||
| @@ -1076,7 +1076,7 @@ LEAQ (, %rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 2), ptrba; | |||
| LEAQ (ptrbb, %rax, 4), ptrbb | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| LD_DX 2*SIZE(ptrbb), xvec3; | |||
| @@ -1106,7 +1106,7 @@ ALIGN_5; | |||
| .L10_bodyB:; | |||
| # Computing kernel | |||
| ##### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 4*SIZE(ptrbb), xvec6; | |||
| SHUF_DX $0x4e, xvec3, xvec5; | |||
| MUL_DX xvec0, xvec2, xvec2; | |||
| @@ -1123,7 +1123,7 @@ SHUF_DX $0x4e, xvec6, xvec4; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec9, xvec9; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 8*SIZE(ptrbb), xvec2; | |||
| SHUF_DX $0x4e, xvec7, xvec5; | |||
| MUL_DX xvec1, xvec6, xvec6; | |||
| @@ -1140,7 +1140,7 @@ SHUF_DX $0x4e, xvec2, xvec4; | |||
| MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec9, xvec9; | |||
| ##### Unroll time 3 #### | |||
| //#### Unroll time 3 #### | |||
| LD_DX 12*SIZE(ptrbb), xvec6; | |||
| SHUF_DX $0x4e, xvec3, xvec5; | |||
| MUL_DX xvec0, xvec2, xvec2; | |||
| @@ -1159,7 +1159,7 @@ ADDQ $8*SIZE, ptrba; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec9, xvec9; | |||
| #### Unroll time 4 #### | |||
| //#### Unroll time 4 #### | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| SHUF_DX $0x4e, xvec7, xvec5; | |||
| MUL_DX xvec1, xvec6, xvec6; | |||
| @@ -1188,7 +1188,7 @@ TEST $2, %rax; | |||
| JLE .L11_loopE; | |||
| ALIGN_5 | |||
| .L11_bodyB:; | |||
| ##### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 4*SIZE(ptrbb), xvec6; | |||
| SHUF_DX $0x4e, xvec3, xvec5; | |||
| MUL_DX xvec0, xvec2, xvec2; | |||
| @@ -1208,7 +1208,7 @@ ADDQ $4*SIZE, ptrba; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec9, xvec9; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| SHUF_DX $0x4e, xvec7, xvec5; | |||
| MUL_DX xvec1, xvec6, xvec6; | |||
| @@ -1251,27 +1251,27 @@ MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec9, xvec9; | |||
| .L12_loopE:; | |||
| #### Load Alpha #### | |||
| //#### Load Alpha #### | |||
| BROAD_DX MEMALPHA, xvec7; | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| MUL_DX xvec7, xvec15, xvec15; | |||
| MUL_DX xvec7, xvec13, xvec13; | |||
| MUL_DX xvec7, xvec11, xvec11; | |||
| MUL_DX xvec7, xvec9, xvec9; | |||
| #### Reverse the Results #### | |||
| //#### Reverse the Results #### | |||
| MOV_DX xvec15, xvec6; | |||
| REVS_DX xvec13, xvec15, xvec15; | |||
| REVS_DX xvec6, xvec13, xvec13; | |||
| MOV_DX xvec11, xvec6; | |||
| REVS_DX xvec9, xvec11, xvec11; | |||
| REVS_DX xvec6, xvec9, xvec9; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L12_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec13, xvec13; | |||
| ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; | |||
| @@ -1345,7 +1345,7 @@ LEAQ (,%rax, SIZE), %rax; | |||
| ADDQ %rax, ptrba; | |||
| LEAQ (ptrbb, %rax, 4), ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| #ifndef TRMMKERNEL | |||
| MOVQ bk, k; | |||
| @@ -1429,11 +1429,11 @@ ADDQ $1*SIZE, ptrba; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| .L16_loopE: | |||
| #### Load Alpha #### | |||
| //#### Load Alpha #### | |||
| BROAD_DY MEMALPHA, yvec7; | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| MUL_DY yvec15, yvec7, yvec15; | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec7; | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||
| @@ -1497,7 +1497,7 @@ LEAQ (, %rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 8), ptrba; | |||
| LEAQ (ptrbb, %rax, 2), ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| XOR_DY yvec14, yvec14, yvec14; | |||
| XOR_DY yvec13, yvec13, yvec13; | |||
| @@ -1526,7 +1526,7 @@ JLE .L211_loopE; | |||
| ALIGN_5; | |||
| .L211_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1563,7 +1563,7 @@ ADD_DX xvec6, xvec9, xvec9; | |||
| MUL_DX xvec3, xvec7, xvec7; | |||
| ADD_DX xvec7, xvec8, xvec8; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 8*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1600,7 +1600,7 @@ ADD_DX xvec6, xvec9, xvec9; | |||
| MUL_DX xvec3, xvec7, xvec7; | |||
| ADD_DX xvec7, xvec8, xvec8; | |||
| #### Unroll time 3 #### | |||
| //#### Unroll time 3 #### | |||
| LD_DX 16*SIZE(ptrba), xvec0; | |||
| LD_DX 4*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1637,7 +1637,7 @@ ADD_DX xvec6, xvec9, xvec9; | |||
| MUL_DX xvec3, xvec7, xvec7; | |||
| ADD_DX xvec7, xvec8, xvec8; | |||
| #### Unroll time 4 #### | |||
| //#### Unroll time 4 #### | |||
| LD_DX 24*SIZE(ptrba), xvec0; | |||
| LD_DX 6*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1689,7 +1689,7 @@ JLE .L212_loopE; | |||
| ALIGN_5; | |||
| .L212_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1726,7 +1726,7 @@ ADD_DX xvec6, xvec9, xvec9; | |||
| MUL_DX xvec3, xvec7, xvec7; | |||
| ADD_DX xvec7, xvec8, xvec8; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 8*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1775,7 +1775,7 @@ TEST $1, %rax; | |||
| JLE .L213_loopE; | |||
| ALIGN_5 | |||
| .L213_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1815,7 +1815,7 @@ MUL_DX xvec3, xvec7, xvec7; | |||
| ADD_DX xvec7, xvec8, xvec8; | |||
| .L213_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DX MEMALPHA, xvec7; | |||
| MUL_DX xvec7, xvec15, xvec15; | |||
| MUL_DX xvec7, xvec14, xvec14; | |||
| @@ -1825,7 +1825,7 @@ MUL_DX xvec7, xvec11, xvec11; | |||
| MUL_DX xvec7, xvec10, xvec10; | |||
| MUL_DX xvec7, xvec9, xvec9; | |||
| MUL_DX xvec7, xvec8, xvec8; | |||
| #### Reverse ##### | |||
| //#### Reverse #### | |||
| MOV_DX xvec15, xvec6; | |||
| REVS_DX xvec11, xvec15, xvec15; | |||
| REVS_DX xvec6, xvec11, xvec11; | |||
| @@ -1838,13 +1838,13 @@ REVS_DX xvec6, xvec9, xvec9; | |||
| MOV_DX xvec12, xvec6; | |||
| REVS_DX xvec8, xvec12, xvec12; | |||
| REVS_DX xvec6, xvec8, xvec8; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L213_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11, xvec11; | |||
| ADD_DX 2*SIZE(C0), xvec10, xvec10; | |||
| @@ -1952,7 +1952,7 @@ LEAQ (,%rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 4), ptrba; | |||
| LEAQ (ptrbb, %rax, 2), ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| XOR_DY yvec14, yvec14, yvec14; | |||
| XOR_DY yvec11, yvec11, yvec11; | |||
| @@ -1977,7 +1977,7 @@ JLE .L221_loopE; | |||
| ALIGN_5 | |||
| .L221_bodyB:; | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -1996,7 +1996,7 @@ ADD_DX xvec4, xvec11, xvec11; | |||
| MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec10, xvec10; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 4*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2015,7 +2015,7 @@ ADD_DX xvec4, xvec11, xvec11; | |||
| MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec10, xvec10; | |||
| #### Unroll time 3 #### | |||
| //#### Unroll time 3 #### | |||
| LD_DX 8*SIZE(ptrba), xvec0; | |||
| LD_DX 4*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2034,7 +2034,7 @@ ADD_DX xvec4, xvec11, xvec11; | |||
| MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec10, xvec10; | |||
| #### Unroll time 4 #### | |||
| //#### Unroll time 4 #### | |||
| LD_DX 12*SIZE(ptrba), xvec0; | |||
| LD_DX 6*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2067,7 +2067,7 @@ TEST $2, %rax; | |||
| JLE .L222_loopE; | |||
| ALIGN_5 | |||
| .L222_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2086,7 +2086,7 @@ ADD_DX xvec4, xvec11, xvec11; | |||
| MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec10, xvec10; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 4*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2116,7 +2116,7 @@ TEST $1, %rax; | |||
| JLE .L223_loopE; | |||
| ALIGN_5 | |||
| .L223_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| MOV_DX xvec4, xvec5; | |||
| @@ -2138,26 +2138,26 @@ MUL_DX xvec1, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec10, xvec10; | |||
| .L223_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DX MEMALPHA, xvec7; | |||
| MUL_DX xvec7, xvec15, xvec15; | |||
| MUL_DX xvec7, xvec14, xvec14; | |||
| MUL_DX xvec7, xvec11, xvec11; | |||
| MUL_DX xvec7, xvec10, xvec10; | |||
| #### Reverse ##### | |||
| //#### Reverse #### | |||
| MOV_DX xvec15, xvec6; | |||
| REVS_DX xvec11, xvec15, xvec15; | |||
| REVS_DX xvec6, xvec11, xvec11; | |||
| MOV_DX xvec14, xvec6; | |||
| REVS_DX xvec10, xvec14, xvec14; | |||
| REVS_DX xvec6, xvec10, xvec10; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L223_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11, xvec11; | |||
| ADD_DX 2*SIZE(C0), xvec10, xvec10; | |||
| @@ -2220,7 +2220,7 @@ ADDQ $4, kk | |||
| ADDQ $4*SIZE, C0; | |||
| ADDQ $4*SIZE, C1; | |||
| .L22_loopE:; | |||
| TEST $2, bm; # Rm = 2 | |||
| TEST $2, bm; // Rm = 2 | |||
| JLE .L23_loopE; | |||
| ALIGN_5; | |||
| .L23_bodyB: | |||
| @@ -2255,7 +2255,7 @@ JLE .L231_loopE; | |||
| ALIGN_5 | |||
| .L231_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2264,7 +2264,7 @@ ADD_DX xvec4, xvec15, xvec15; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec11, xvec11; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 2*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2273,7 +2273,7 @@ ADD_DX xvec4, xvec15, xvec15; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec11, xvec11; | |||
| #### Unroll time 3 #### | |||
| //#### Unroll time 3 #### | |||
| LD_DX 4*SIZE(ptrba), xvec0; | |||
| LD_DX 4*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2282,7 +2282,7 @@ ADD_DX xvec4, xvec15, xvec15; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec11, xvec11; | |||
| #### Unroll time 4 #### | |||
| //#### Unroll time 4 #### | |||
| LD_DX 6*SIZE(ptrba), xvec0; | |||
| LD_DX 6*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2305,7 +2305,7 @@ TEST $2, %rax; | |||
| JLE .L232_loopE; | |||
| ALIGN_5 | |||
| .L232_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2314,7 +2314,7 @@ ADD_DX xvec4, xvec15, xvec15; | |||
| MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec11, xvec11; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DX 2*SIZE(ptrba), xvec0; | |||
| LD_DX 2*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2334,7 +2334,7 @@ TEST $1, %rax; | |||
| JLE .L233_loopE; | |||
| ALIGN_5 | |||
| .L233_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec4; | |||
| SHUF_DX $0x4e, xvec4, xvec5; | |||
| @@ -2345,21 +2345,21 @@ MUL_DX xvec0, xvec5, xvec5; | |||
| ADD_DX xvec5, xvec11, xvec11; | |||
| ADDQ $2*SIZE, ptrbb; | |||
| .L233_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DX MEMALPHA, xvec7; | |||
| MUL_DX xvec7, xvec15, xvec15; | |||
| MUL_DX xvec7, xvec11, xvec11; | |||
| #### Reverse ##### | |||
| //#### Reverse #### | |||
| MOV_DX xvec15, xvec6; | |||
| REVS_DX xvec11, xvec15, xvec15; | |||
| REVS_DX xvec6, xvec11, xvec11; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L233_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11, xvec11; | |||
| ADD_DX 0*SIZE(C1), xvec15, xvec15; | |||
| @@ -2408,7 +2408,7 @@ ADDQ $2, kk; | |||
| ADDQ $2*SIZE, C0; | |||
| ADDQ $2*SIZE, C1; | |||
| .L23_loopE: | |||
| TEST $1, bm; # Rm = 1 | |||
| TEST $1, bm; // Rm = 1 | |||
| JLE .L24_loopE; | |||
| ALIGN_5; | |||
| .L24_bodyB: | |||
| @@ -2534,7 +2534,7 @@ SALQ $4, k; | |||
| ADDQ k, bb; | |||
| LEAQ (C, ldc, 2), C; | |||
| .L20_loopE:; | |||
| TEST $1, bn; # Rn = 1 | |||
| TEST $1, bn; // Rn = 1 | |||
| JLE .L30_loopE; | |||
| ALIGN_5 | |||
| .L30_bodyB: | |||
| @@ -2558,7 +2558,7 @@ LEAQ (, %rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 8), ptrba; | |||
| ADDQ %rax, ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| XOR_DY yvec14, yvec14, yvec14; | |||
| #ifndef TRMMKERNEL | |||
| @@ -2580,7 +2580,7 @@ SARQ $2, k; | |||
| JLE .L311_loopE; | |||
| ALIGN_5 | |||
| .L311_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2589,7 +2589,7 @@ ADD_DY yvec0, yvec15, yvec15; | |||
| MUL_DY yvec2, yvec1, yvec1; | |||
| ADD_DY yvec1, yvec14, yvec14; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DY 8*SIZE(ptrba), yvec3; | |||
| LD_DY 12*SIZE(ptrba), yvec4; | |||
| BROAD_DY 1*SIZE(ptrbb), yvec5; | |||
| @@ -2598,7 +2598,7 @@ ADD_DY yvec3, yvec15, yvec15; | |||
| MUL_DY yvec5, yvec4, yvec4 | |||
| ADD_DY yvec4, yvec14, yvec14; | |||
| #### Unroll time 3 #### | |||
| //#### Unroll time 3 #### | |||
| LD_DY 16*SIZE(ptrba), yvec0; | |||
| LD_DY 20*SIZE(ptrba), yvec1; | |||
| BROAD_DY 2*SIZE(ptrbb), yvec2; | |||
| @@ -2607,7 +2607,7 @@ ADD_DY yvec0, yvec15, yvec15; | |||
| MUL_DY yvec2, yvec1, yvec1; | |||
| ADD_DY yvec1, yvec14, yvec14; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DY 24*SIZE(ptrba), yvec3; | |||
| LD_DY 28*SIZE(ptrba), yvec4; | |||
| BROAD_DY 3*SIZE(ptrbb), yvec5; | |||
| @@ -2630,7 +2630,7 @@ TEST $2, %rax; | |||
| JLE .L312_loopE; | |||
| ALIGN_5 | |||
| .L312_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2639,7 +2639,7 @@ ADD_DY yvec0, yvec15, yvec15; | |||
| MUL_DY yvec2, yvec1, yvec1; | |||
| ADD_DY yvec1, yvec14, yvec14; | |||
| #### Unroll time 2 #### | |||
| //#### Unroll time 2 #### | |||
| LD_DY 8*SIZE(ptrba), yvec3; | |||
| LD_DY 12*SIZE(ptrba), yvec4; | |||
| BROAD_DY 1*SIZE(ptrbb), yvec5; | |||
| @@ -2660,7 +2660,7 @@ TEST $1, %rax; | |||
| JLE .L313_loopE; | |||
| ALIGN_5 | |||
| .L313_bodyB: | |||
| #### Unroll time 1 #### | |||
| //#### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2672,17 +2672,17 @@ ADD_DY yvec1, yvec14, yvec14; | |||
| ADDQ $1*SIZE, ptrbb; | |||
| .L313_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DY MEMALPHA, yvec7; | |||
| MUL_DY yvec7, yvec15, yvec15; | |||
| MUL_DY yvec7, yvec14, yvec14; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L313_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec13; | |||
| EXTRA_DY $1, yvec14, xvec12; | |||
| #ifndef TRMMKERNEL | |||
| @@ -2762,7 +2762,7 @@ LEAQ (,%rax, SIZE), %rax; | |||
| LEAQ (ptrba, %rax, 4), ptrba; | |||
| ADDQ %rax, ptrbb; | |||
| #endif | |||
| #### Initial Results Register #### | |||
| //#### Initial Results Register #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| #ifndef TRMMKERNEL | |||
| MOVQ bk, k; | |||
| @@ -2847,16 +2847,16 @@ ADDQ $4*SIZE, ptrba; | |||
| ADDQ $1*SIZE, ptrbb; | |||
| .L323_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DY MEMALPHA, yvec7; | |||
| MUL_DY yvec7, yvec15, yvec15; | |||
| #### Testing Alignment #### | |||
| //#### Testing Alignment #### | |||
| MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L323_loopEx; | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec14; | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec15, xvec15; | |||
| @@ -2878,7 +2878,7 @@ ADDQ $4*SIZE, C0; | |||
| JMP .L32_loopE; | |||
| ALIGN_5 | |||
| .L323_loopEx: | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec14; | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec13, xvec13; | |||
| @@ -2917,7 +2917,7 @@ LEAQ (, %rax, SIZE), %rax | |||
| LEAQ (ptrba, %rax, 2), ptrba | |||
| ADDQ %rax, ptrbb; | |||
| #endif | |||
| #### Initial Result #### | |||
| //#### Initial Result #### | |||
| XOR_DY yvec15, yvec15, yvec15; | |||
| #ifndef TRMMKERNEL | |||
| MOVQ bk, k; | |||
| @@ -3000,7 +3000,7 @@ ADD_DX xvec2, xvec15, xvec15; | |||
| ADDQ $2*SIZE, ptrba; | |||
| ADDQ $1*SIZE, ptrbb; | |||
| .L333_loopE: | |||
| #### Multiply Alpha #### | |||
| //#### Multiply Alpha #### | |||
| BROAD_DX MEMALPHA, xvec7; | |||
| MUL_DX xvec7, xvec15, xvec15; | |||
| #ifndef TRMMKERNEL | |||
| @@ -3119,7 +3119,7 @@ addq $1*SIZE, ptrba; | |||
| addq $1*SIZE, ptrbb; | |||
| .L343_loopE: | |||
| #### Writing Back #### | |||
| //#### Writing Back #### | |||
| vmovsd MEMALPHA, xvec7; | |||
| vmulsd xvec7, xvec15, xvec15; | |||
| #ifndef TRMMKERNEL | |||