| @@ -148,74 +148,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #undef MOVQ | #undef MOVQ | ||||
| #define MOVQ movq | #define MOVQ movq | ||||
| #define XOR_SY vxorps | |||||
| #define XOR_DY vxorpd | #define XOR_DY vxorpd | ||||
| #define XOR_SX xorps | |||||
| #define XOR_DX xorpd | |||||
| #define XOR_DX vxorpd | |||||
| #define LD_SY vmovaps | |||||
| #define LD_DY vmovapd | #define LD_DY vmovapd | ||||
| #define LD_SX movaps | |||||
| #define LD_DX movapd | |||||
| #define LD_DX vmovapd | |||||
| #define LDL_DY vmovlpd | #define LDL_DY vmovlpd | ||||
| #define LDL_DX movlpd | |||||
| #define LDL_DX vmovlpd | |||||
| #define LDH_DY vmovhpd | #define LDH_DY vmovhpd | ||||
| #define LDH_DX movhpd | |||||
| #define LDH_DX vmovhpd | |||||
| #define ST_SY vmovaps | |||||
| #define ST_DY vmovapd | #define ST_DY vmovapd | ||||
| #define ST_SX movaps | |||||
| #define ST_DX movapd | |||||
| #define ST_DX vmovapd | |||||
| #define STL_DY vmovlpd | #define STL_DY vmovlpd | ||||
| #define STL_DX movlpd | |||||
| #define STL_DX vmovlpd | |||||
| #define STH_DY vmovhpd | #define STH_DY vmovhpd | ||||
| #define STH_DX movhpd | |||||
| #define STH_DX vmovhpd | |||||
| #define EDUP_SY vmovsldup | |||||
| #define ODUP_SY vmovshdup | |||||
| #define EDUP_SX movsldup | |||||
| #define ODUP_SX movshdup | |||||
| #define EDUP_DY vmovddup | #define EDUP_DY vmovddup | ||||
| #define ADD_SY vaddps | |||||
| #define ADD_DY vaddpd | #define ADD_DY vaddpd | ||||
| #define ADD_SX addps | |||||
| #define ADD_DX addpd | |||||
| #define ADD_DX vaddpd | |||||
| #define SUB_DY vsubpd | #define SUB_DY vsubpd | ||||
| #define SUB_DX subpd | |||||
| #define SUB_DX vsubpd | |||||
| #define ADDSUB_DY vaddsubpd | #define ADDSUB_DY vaddsubpd | ||||
| #define ADDSUB_DX addsubpd | |||||
| #define ADDSUB_SY vaddsubps | |||||
| #define ADDSUB_DX vaddsubpd | |||||
| #define MUL_SY vmulps | |||||
| #define MUL_DY vmulpd | #define MUL_DY vmulpd | ||||
| #define MUL_SX mulps | |||||
| #define MUL_DX mulpd | |||||
| #define MUL_DX vmulpd | |||||
| #define SHUF_SY vperm2f128 | |||||
| #define SHUF_DY vperm2f128 | #define SHUF_DY vperm2f128 | ||||
| #define SHUF_DX pshufd | |||||
| #define SHUF_SX pshufd | |||||
| #define SHUF_DX vpshufd | |||||
| #define VPERMILP_SY vpermilps | |||||
| #define VPERMILP_SX vpermilps | |||||
| #define VPERMILP_DY vpermilpd | #define VPERMILP_DY vpermilpd | ||||
| #define BROAD_SY vbroadcastss | |||||
| #define BROAD_DY vbroadcastsd | #define BROAD_DY vbroadcastsd | ||||
| #define BROAD_SX vbroadcastss | |||||
| #define BROAD_DX movddup | |||||
| #define BROAD_DX vmovddup | |||||
| #define MOV_SY vmovaps | |||||
| #define MOV_DY vmovapd | #define MOV_DY vmovapd | ||||
| #define MOV_SX movaps | |||||
| #define MOV_DX movapd | |||||
| #define MOV_DX vmovapd | |||||
| #define REVS_SY vshufps | |||||
| #define REVS_DY vshufpd | #define REVS_DY vshufpd | ||||
| #define REVS_SX shufps | |||||
| #define REVS_DX movsd | |||||
| #define REVS_DX vmovsd | |||||
| #define EXTRA_DY vextractf128 | #define EXTRA_DY vextractf128 | ||||
| @@ -282,6 +257,8 @@ movq old_offset, %r11; | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| vzeroupper | |||||
| vmovlps %xmm0, MEMALPHA_R | vmovlps %xmm0, MEMALPHA_R | ||||
| vmovlps %xmm1, MEMALPHA_I | vmovlps %xmm1, MEMALPHA_I | ||||
| movq old_bm, bm | movq old_bm, bm | ||||
| @@ -1373,14 +1350,14 @@ EXTRA_DY $1, yvec14, xvec6; | |||||
| EXTRA_DY $1, yvec13, xvec5; | EXTRA_DY $1, yvec13, xvec5; | ||||
| EXTRA_DY $1, yvec12, xvec4; | EXTRA_DY $1, yvec12, xvec4; | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| ADD_DX 0*SIZE(C0), xvec15; | |||||
| ADD_DX 2*SIZE(C0, ldc, 1), xvec7; | |||||
| ADD_DX 0*SIZE(C0, ldc, 1), xvec13; | |||||
| ADD_DX 2*SIZE(C0), xvec5; | |||||
| ADD_DX 0*SIZE(C1), xvec14; | |||||
| ADD_DX 2*SIZE(C1, ldc, 1), xvec6; | |||||
| ADD_DX 0*SIZE(C1, ldc, 1), xvec12; | |||||
| ADD_DX 2*SIZE(C1), xvec4; | |||||
| ADD_DX 0*SIZE(C0), xvec15, xvec15; | |||||
| ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; | |||||
| ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; | |||||
| ADD_DX 2*SIZE(C0), xvec5, xvec5; | |||||
| ADD_DX 0*SIZE(C1), xvec14, xvec14; | |||||
| ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; | |||||
| ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; | |||||
| ADD_DX 2*SIZE(C1), xvec4, xvec4; | |||||
| #endif | #endif | ||||
| ST_DX xvec15, 0*SIZE(C0); | ST_DX xvec15, 0*SIZE(C0); | ||||
| ST_DX xvec7, 2*SIZE(C0, ldc, 1); | ST_DX xvec7, 2*SIZE(C0, ldc, 1); | ||||
| @@ -1410,18 +1387,18 @@ EXTRA_DY $1, yvec14, xvec6; | |||||
| EXTRA_DY $1, yvec13, xvec5; | EXTRA_DY $1, yvec13, xvec5; | ||||
| EXTRA_DY $2, yvec12, xvec4; | EXTRA_DY $2, yvec12, xvec4; | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 2*SIZE(C0, ldc, 1), xvec1; | |||||
| LDH_DX 3*SIZE(C0, ldc, 1), xvec1; | |||||
| LDL_DX 0*SIZE(C0, ldc, 1), xvec2; | |||||
| LDH_DX 1*SIZE(C0, ldc, 1), xvec2; | |||||
| LDL_DX 2*SIZE(C0), xvec3; | |||||
| LDH_DX 3*SIZE(C0), xvec3; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| ADD_DX xvec2, xvec13; | |||||
| ADD_DX xvec3, xvec5; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; | |||||
| LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; | |||||
| LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; | |||||
| LDL_DX 2*SIZE(C0), xvec3, xvec3; | |||||
| LDH_DX 3*SIZE(C0), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| ADD_DX xvec2, xvec13, xvec13; | |||||
| ADD_DX xvec3, xvec5, xvec5; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -1432,18 +1409,18 @@ STH_DX xvec13, 1*SIZE(C0, ldc, 1); | |||||
| STL_DX xvec6, 2*SIZE(C0); | STL_DX xvec6, 2*SIZE(C0); | ||||
| STH_DX xvec6, 3*SIZE(C0); | STH_DX xvec6, 3*SIZE(C0); | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C1), xvec0; | |||||
| LDH_DX 1*SIZE(C1), xvec0; | |||||
| LDL_DX 2*SIZE(C1, ldc, 1), xvec1; | |||||
| LDH_DX 3*SIZE(C1, ldc, 1), xvec1; | |||||
| LDL_DX 0*SIZE(C1, ldc, 1), xvec2; | |||||
| LDH_DX 1*SIZE(C1, ldc, 1), xvec2; | |||||
| LDL_DX 2*SIZE(C1), xvec3; | |||||
| LDH_DX 3*SIZE(C1), xvec3; | |||||
| ADD_DX xvec0, xvec14; | |||||
| ADD_DX xvec1, xvec6; | |||||
| ADD_DX xvec2, xvec12; | |||||
| ADD_DX xvec3, xvec4; | |||||
| LDL_DX 0*SIZE(C1), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C1), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; | |||||
| LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; | |||||
| LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; | |||||
| LDL_DX 2*SIZE(C1), xvec3, xvec3; | |||||
| LDH_DX 3*SIZE(C1), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec14, xvec14; | |||||
| ADD_DX xvec1, xvec6, xvec6; | |||||
| ADD_DX xvec2, xvec12, xvec12; | |||||
| ADD_DX xvec3, xvec4, xvec4; | |||||
| #endif | #endif | ||||
| STL_DX xvec14, 0*SIZE(C1); | STL_DX xvec14, 0*SIZE(C1); | ||||
| STH_DX xvec14, 1*SIZE(C1); | STH_DX xvec14, 1*SIZE(C1); | ||||
| @@ -1680,18 +1657,18 @@ ADD2_DY yvec4, yvec14, yvec14; | |||||
| EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
| EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 0*SIZE(C0, ldc, 1), xvec1; | |||||
| LDH_DX 1*SIZE(C0, ldc, 1), xvec1; | |||||
| LDL_DX 0*SIZE(C1), xvec2; | |||||
| LDH_DX 1*SIZE(C1), xvec2; | |||||
| LDL_DX 0*SIZE(C1, ldc, 1), xvec3; | |||||
| LDH_DX 1*SIZE(C1, ldc, 1), xvec3; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| ADD_DX xvec2, xvec14; | |||||
| ADD_DX xvec3, xvec6; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; | |||||
| LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; | |||||
| LDL_DX 0*SIZE(C1), xvec2, xvec2; | |||||
| LDH_DX 1*SIZE(C1), xvec2, xvec2; | |||||
| LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; | |||||
| LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| ADD_DX xvec2, xvec14, xvec14; | |||||
| ADD_DX xvec3, xvec6, xvec6; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -2063,14 +2040,14 @@ JNE .L213_loopEx; | |||||
| ALIGN_5 | ALIGN_5 | ||||
| #### Writing back #### | #### Writing back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| ADD_DX 0*SIZE(C0),xvec15; | |||||
| ADD_DX 2*SIZE(C1),xvec7; | |||||
| ADD_DX 4*SIZE(C0),xvec14; | |||||
| ADD_DX 6*SIZE(C1),xvec6; | |||||
| ADD_DX 0*SIZE(C1),xvec13; | |||||
| ADD_DX 2*SIZE(C0),xvec5; | |||||
| ADD_DX 4*SIZE(C1),xvec12; | |||||
| ADD_DX 6*SIZE(C0),xvec4; | |||||
| ADD_DX 0*SIZE(C0), xvec15, xvec15; | |||||
| ADD_DX 2*SIZE(C1), xvec7, xvec7; | |||||
| ADD_DX 4*SIZE(C0), xvec14, xvec14; | |||||
| ADD_DX 6*SIZE(C1), xvec6, xvec6; | |||||
| ADD_DX 0*SIZE(C1), xvec13, xvec13; | |||||
| ADD_DX 2*SIZE(C0), xvec5, xvec5; | |||||
| ADD_DX 4*SIZE(C1), xvec12, xvec12; | |||||
| ADD_DX 6*SIZE(C0), xvec4, xvec4; | |||||
| #endif | #endif | ||||
| ST_DX xvec15,0*SIZE(C0); | ST_DX xvec15,0*SIZE(C0); | ||||
| ST_DX xvec7,2*SIZE(C1); | ST_DX xvec7,2*SIZE(C1); | ||||
| @@ -2098,18 +2075,18 @@ JMP .L21_loopE; | |||||
| ALIGN_5 | ALIGN_5 | ||||
| .L213_loopEx: | .L213_loopEx: | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 2*SIZE(C1), xvec1; | |||||
| LDH_DX 3*SIZE(C1), xvec1; | |||||
| LDL_DX 4*SIZE(C0), xvec2; | |||||
| LDH_DX 5*SIZE(C0), xvec2; | |||||
| LDL_DX 6*SIZE(C1), xvec3; | |||||
| LDH_DX 7*SIZE(C1), xvec3; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| ADD_DX xvec2, xvec14; | |||||
| ADD_DX xvec3, xvec6; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C1), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C1), xvec1, xvec1; | |||||
| LDL_DX 4*SIZE(C0), xvec2, xvec2; | |||||
| LDH_DX 5*SIZE(C0), xvec2, xvec2; | |||||
| LDL_DX 6*SIZE(C1), xvec3, xvec3; | |||||
| LDH_DX 7*SIZE(C1), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| ADD_DX xvec2, xvec14, xvec14; | |||||
| ADD_DX xvec3, xvec6, xvec6; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -2120,18 +2097,18 @@ STH_DX xvec14, 5*SIZE(C0); | |||||
| STL_DX xvec6, 6*SIZE(C1); | STL_DX xvec6, 6*SIZE(C1); | ||||
| STH_DX xvec6, 7*SIZE(C1); | STH_DX xvec6, 7*SIZE(C1); | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C1), xvec3; | |||||
| LDH_DX 1*SIZE(C1), xvec3; | |||||
| LDL_DX 2*SIZE(C0), xvec2; | |||||
| LDH_DX 3*SIZE(C0), xvec2; | |||||
| LDL_DX 4*SIZE(C1), xvec1; | |||||
| LDH_DX 5*SIZE(C1), xvec1; | |||||
| LDL_DX 6*SIZE(C0), xvec0; | |||||
| LDH_DX 7*SIZE(C0), xvec0; | |||||
| ADD_DX xvec3, xvec13; | |||||
| ADD_DX xvec2, xvec5; | |||||
| ADD_DX xvec1, xvec12; | |||||
| ADD_DX xvec0, xvec4; | |||||
| LDL_DX 0*SIZE(C1), xvec3, xvec3; | |||||
| LDH_DX 1*SIZE(C1), xvec3, xvec3; | |||||
| LDL_DX 2*SIZE(C0), xvec2, xvec2; | |||||
| LDH_DX 3*SIZE(C0), xvec2, xvec2; | |||||
| LDL_DX 4*SIZE(C1), xvec1, xvec1; | |||||
| LDH_DX 5*SIZE(C1), xvec1, xvec1; | |||||
| LDL_DX 6*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 7*SIZE(C0), xvec0, xvec0; | |||||
| ADD_DX xvec3, xvec13, xvec13; | |||||
| ADD_DX xvec2, xvec5, xvec5; | |||||
| ADD_DX xvec1, xvec12, xvec12; | |||||
| ADD_DX xvec0, xvec4, xvec4; | |||||
| #endif | #endif | ||||
| STL_DX xvec13, 0*SIZE(C1); | STL_DX xvec13, 0*SIZE(C1); | ||||
| STH_DX xvec13, 1*SIZE(C1); | STH_DX xvec13, 1*SIZE(C1); | ||||
| @@ -2384,18 +2361,18 @@ EXTRA_DY $1, yvec15, xvec7; | |||||
| EXTRA_DY $1, yvec13, xvec5; | EXTRA_DY $1, yvec13, xvec5; | ||||
| #### Write back #### | #### Write back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 2*SIZE(C1), xvec1; | |||||
| LDH_DX 3*SIZE(C1), xvec1; | |||||
| LDL_DX 0*SIZE(C1), xvec2; | |||||
| LDH_DX 1*SIZE(C1), xvec2; | |||||
| LDL_DX 2*SIZE(C0), xvec3; | |||||
| LDH_DX 3*SIZE(C0), xvec3; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| ADD_DX xvec2, xvec13; | |||||
| ADD_DX xvec3, xvec5; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C1), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C1), xvec1, xvec1; | |||||
| LDL_DX 0*SIZE(C1), xvec2, xvec2; | |||||
| LDH_DX 1*SIZE(C1), xvec2, xvec2; | |||||
| LDL_DX 2*SIZE(C0), xvec3, xvec3; | |||||
| LDH_DX 3*SIZE(C0), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| ADD_DX xvec2, xvec13, xvec13; | |||||
| ADD_DX xvec3, xvec5, xvec5; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -2582,12 +2559,12 @@ ADD2_DY yvec5, yvec15, yvec15; | |||||
| EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
| #### Writing Back #### | #### Writing Back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 0*SIZE(C1), xvec1; | |||||
| LDH_DX 1*SIZE(C1), xvec1; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 0*SIZE(C1), xvec1, xvec1; | |||||
| LDH_DX 1*SIZE(C1), xvec1, xvec1; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -2845,18 +2822,18 @@ EXTRA_DY $1, yvec15, xvec7; | |||||
| EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
| #### Writing Back #### | #### Writing Back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 2*SIZE(C0), xvec1; | |||||
| LDH_DX 3*SIZE(C0), xvec1; | |||||
| LDL_DX 4*SIZE(C0), xvec2; | |||||
| LDH_DX 5*SIZE(C0), xvec2; | |||||
| LDL_DX 6*SIZE(C0), xvec3; | |||||
| LDH_DX 7*SIZE(C0), xvec3; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| ADD_DX xvec2, xvec14; | |||||
| ADD_DX xvec3, xvec6; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C0), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C0), xvec1, xvec1; | |||||
| LDL_DX 4*SIZE(C0), xvec2, xvec2; | |||||
| LDH_DX 5*SIZE(C0), xvec2, xvec2; | |||||
| LDL_DX 6*SIZE(C0), xvec3, xvec3; | |||||
| LDH_DX 7*SIZE(C0), xvec3, xvec3; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| ADD_DX xvec2, xvec14, xvec14; | |||||
| ADD_DX xvec3, xvec6, xvec6; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -3026,12 +3003,12 @@ ADD2_DY yvec5, yvec15, yvec15; | |||||
| EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
| #### Writing Back #### | #### Writing Back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| LDL_DX 2*SIZE(C0), xvec1; | |||||
| LDH_DX 3*SIZE(C0), xvec1; | |||||
| ADD_DX xvec0, xvec15; | |||||
| ADD_DX xvec1, xvec7; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| LDL_DX 2*SIZE(C0), xvec1, xvec1; | |||||
| LDH_DX 3*SIZE(C0), xvec1, xvec1; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| ADD_DX xvec1, xvec7, xvec7; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -3084,43 +3061,43 @@ ALIGN_5 | |||||
| .L331_bodyB: | .L331_bodyB: | ||||
| LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 1*SIZE(ptrbb), xvec3; | BROAD_DX 1*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| LD_DX 2*SIZE(ptrba), xvec0; | LD_DX 2*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 2*SIZE(ptrbb), xvec2; | BROAD_DX 2*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 3*SIZE(ptrbb), xvec3; | BROAD_DX 3*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| LD_DX 4*SIZE(ptrba), xvec0; | LD_DX 4*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 4*SIZE(ptrbb), xvec2; | BROAD_DX 4*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 5*SIZE(ptrbb), xvec3; | BROAD_DX 5*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| LD_DX 6*SIZE(ptrba), xvec0; | LD_DX 6*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 6*SIZE(ptrbb), xvec2; | BROAD_DX 6*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 7*SIZE(ptrbb), xvec3; | BROAD_DX 7*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| ADDQ $8*SIZE, ptrba; | ADDQ $8*SIZE, ptrba; | ||||
| ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
| DECQ k; | DECQ k; | ||||
| @@ -3137,23 +3114,23 @@ ALIGN_5 | |||||
| .L332_bodyB: | .L332_bodyB: | ||||
| LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 1*SIZE(ptrbb), xvec3; | BROAD_DX 1*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| LD_DX 2*SIZE(ptrba), xvec0; | LD_DX 2*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 2*SIZE(ptrbb), xvec2; | BROAD_DX 2*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 3*SIZE(ptrbb), xvec3; | BROAD_DX 3*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| ADDQ $4*SIZE, ptrba; | ADDQ $4*SIZE, ptrba; | ||||
| ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
| @@ -3168,13 +3145,13 @@ ALIGN_5 | |||||
| .L333_bodyB: | .L333_bodyB: | ||||
| LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
| MUL_DX xvec0, xvec2; | |||||
| ADD1_DX xvec2, xvec15; | |||||
| MUL_DX xvec0, xvec2, xvec2; | |||||
| ADD1_DX xvec2, xvec15, xvec15; | |||||
| SHUF_DX $0x4e, xvec0, xvec1; | SHUF_DX $0x4e, xvec0, xvec1; | ||||
| BROAD_DX 1*SIZE(ptrbb), xvec3; | BROAD_DX 1*SIZE(ptrbb), xvec3; | ||||
| MUL_DX xvec1, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15; | |||||
| MUL_DX xvec1, xvec3, xvec3; | |||||
| ADDSUB_DX xvec3, xvec15, xvec15; | |||||
| ADDQ $2*SIZE, ptrba; | ADDQ $2*SIZE, ptrba; | ||||
| ADDQ $2*SIZE, ptrbb; | ADDQ $2*SIZE, ptrbb; | ||||
| @@ -3182,14 +3159,14 @@ ADDQ $2*SIZE, ptrbb; | |||||
| #### Handle #### | #### Handle #### | ||||
| XOR_DY yvec7, yvec7, yvec7; | XOR_DY yvec7, yvec7, yvec7; | ||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | ||||
| ADDSUB_DX xvec15, xvec7; | |||||
| ADDSUB_DX xvec15, xvec7, xvec7; | |||||
| MOV_DX xvec7, xvec15; | MOV_DX xvec7, xvec15; | ||||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| SUB_DX xvec15, xvec7; | |||||
| SUB_DX xvec15, xvec7, xvec7; | |||||
| MOV_DX xvec7, xvec15; | MOV_DX xvec7, xvec15; | ||||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
| SHUF_DX $0x4e, xvec15, xvec15; | SHUF_DX $0x4e, xvec15, xvec15; | ||||
| ADDSUB_DX xvec15, xvec7; | |||||
| ADDSUB_DX xvec15, xvec7, xvec7; | |||||
| MOV_DX xvec7, xvec15; | MOV_DX xvec7, xvec15; | ||||
| SHUF_DX $0x4e, xvec15, xvec15; | SHUF_DX $0x4e, xvec15, xvec15; | ||||
| #endif | #endif | ||||
| @@ -3199,14 +3176,14 @@ BROAD_DX MEMALPHA_R,xvec7; | |||||
| BROAD_DX MEMALPHA_I,xvec6; | BROAD_DX MEMALPHA_I,xvec6; | ||||
| #### Multiply Alpha #### | #### Multiply Alpha #### | ||||
| SHUF_DX $0x4e, xvec15, xvec5; | SHUF_DX $0x4e, xvec15, xvec5; | ||||
| MUL_DX xvec7, xvec15; | |||||
| MUL_DX xvec6, xvec5; | |||||
| ADDSUB_DX xvec5, xvec15; | |||||
| MUL_DX xvec7, xvec15, xvec15; | |||||
| MUL_DX xvec6, xvec5, xvec5; | |||||
| ADDSUB_DX xvec5, xvec15, xvec15; | |||||
| #### Writing back #### | #### Writing back #### | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| LDL_DX 0*SIZE(C0), xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0; | |||||
| ADD_DX xvec0, xvec15; | |||||
| LDL_DX 0*SIZE(C0), xvec0, xvec0; | |||||
| LDH_DX 1*SIZE(C0), xvec0, xvec0; | |||||
| ADD_DX xvec0, xvec15, xvec15; | |||||
| #endif | #endif | ||||
| STL_DX xvec15, 0*SIZE(C0); | STL_DX xvec15, 0*SIZE(C0); | ||||
| STH_DX xvec15, 1*SIZE(C0); | STH_DX xvec15, 1*SIZE(C0); | ||||
| @@ -3237,6 +3214,9 @@ movq 24(%rsp), %r13; | |||||
| movq 32(%rsp), %r14; | movq 32(%rsp), %r14; | ||||
| movq 40(%rsp), %r15; | movq 40(%rsp), %r15; | ||||
| vzeroupper | |||||
| #ifdef WINDOWS_ABI | #ifdef WINDOWS_ABI | ||||
| movq 48(%rsp), %rdi | movq 48(%rsp), %rdi | ||||
| movq 56(%rsp), %rsi | movq 56(%rsp), %rsi | ||||