| @@ -171,11 +171,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #define MMXSTORE movd | |||
| #endif | |||
| #if defined(SANDYBRIDGE) || defined(HASWELL) | |||
| // Define NEHALEM_OPTIMIZATION for SANDYBRIDGE and HASWELL builds so they also get the Nehalem optimizations. | |||
| #define NEHALEM_OPTIMIZATION | |||
| #endif | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||
| // Enable some optimizations for Barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| @@ -306,12 +301,25 @@ REALNAME: | |||
| #define PROFCODE | |||
| #endif | |||
| #if defined(C_PATHSCALE) || defined(OS_DARWIN) /* EPILOGUE emits .size for REALNAME, then switches to .note.GNU-stack; this branch spells the section type @progbits */ | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| .section .note.GNU-stack,"",@progbits | |||
| #else /* all other builds spell the section type %progbits. NOTE(review): the .size row is repeated in this branch; it looks like interleaved before/after diff rows - confirm */ | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| .section .note.GNU-stack,"",%progbits | |||
| #endif | |||
| #endif | |||
| #ifdef XDOUBLE | |||
| #define FLD fldt | |||
| #define FST fstpt | |||
| @@ -218,12 +218,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #ifdef ASSEMBLER | |||
| #if defined(SANDYBRIDGE) || defined(HASWELL) | |||
| // Define NEHALEM_OPTIMIZATION for SANDYBRIDGE and HASWELL builds so they also get the Nehalem optimizations. | |||
| #define NEHALEM_OPTIMIZATION | |||
| #endif | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||
| // Enable some optimizations for Barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| @@ -378,10 +372,20 @@ REALNAME: | |||
| #define PROFCODE | |||
| #endif | |||
| #if defined(C_PATHSCALE) || defined(OS_DARWIN) /* EPILOGUE emits .size for REALNAME, then switches to .note.GNU-stack; this branch spells the section type @progbits */ | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| .section .note.GNU-stack,"",@progbits | |||
| #else /* all other builds spell the section type %progbits */ | |||
| #define EPILOGUE \ | |||
| .size REALNAME, .-REALNAME; \ | |||
| .section .note.GNU-stack,"",%progbits | |||
| #endif | |||
| #endif | |||
| @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| /********************************************************************* | |||
| * 2013/10/28 Saar | |||
| * 2013/11/13 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| @@ -138,43 +138,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* four-operand FMA helpers, y0 is the accumulator. Here R and I both add: y0 += y1*y2. NOTE(review): each helper is given both as an assembler .macro and as a cpp #define - looks like interleaved before/after diff rows; confirm */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* R subtracts (y0 -= y1*y2), I adds */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfnmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* R adds, I subtracts */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfnmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 | |||
| #else /* none of the variants above: R and I both subtract */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfnmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfnmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 | |||
| #endif | |||
| @@ -182,43 +166,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* three-operand (231-form) FMA helpers, y0 is the accumulator. Here R and I both add: y0 += y1*y2. NOTE(review): each helper is given both as an assembler .macro and as a cpp #define - looks like interleaved before/after diff rows; confirm */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* R subtracts (y0 -= y1*y2), I adds */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfnmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* R adds, I subtracts */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfnmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 | |||
| #else /* none of the variants above: R and I both subtract */ | |||
| .macro VFMADDPS_R y0,y1,y2 | |||
| vfnmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 | |||
| .macro VFMADDPS_I y0,y1,y2 | |||
| vfnmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 | |||
| #endif | |||
| @@ -234,18 +202,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 | |||
| VFMADDPS_R %ymm8,%ymm4,%ymm0 | |||
| VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| VFMADDPS_R %ymm12,%ymm4,%ymm1 | |||
| VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) | |||
| vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 | |||
| VFMADDPS_I %ymm9,%ymm5,%ymm0 | |||
| VFMADDPS_I %ymm13,%ymm5,%ymm1 | |||
| VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) | |||
| VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) | |||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 | |||
| VFMADDPS_R %ymm10,%ymm6,%ymm0 | |||
| VFMADDPS_R %ymm14,%ymm6,%ymm1 | |||
| VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) | |||
| VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) | |||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 | |||
| VFMADDPS_I %ymm11,%ymm7,%ymm0 | |||
| VFMADDPS_I %ymm15,%ymm7,%ymm1 | |||
| VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) | |||
| VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) | |||
| addq $4 , BI | |||
| addq $16, %rax | |||
| .endm | |||
| @@ -338,18 +306,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x2_SUB /* one inner step: two xmm loads from AO, four broadcasts from BO, VFMADDPS_R/_I update xmm8-xmm15. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| VFMADDPS_R %xmm12,%xmm4,%xmm1 | |||
| VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) | |||
| vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I %xmm13,%xmm5,%xmm1 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) | |||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 | |||
| VFMADDPS_R %xmm10,%xmm6,%xmm0 | |||
| VFMADDPS_R %xmm14,%xmm6,%xmm1 | |||
| VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) | |||
| VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) | |||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 | |||
| VFMADDPS_I %xmm11,%xmm7,%xmm0 | |||
| VFMADDPS_I %xmm15,%xmm7,%xmm1 | |||
| VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) | |||
| VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) | |||
| addq $4, BI /* BI is the SIZE-scaled index into BO: four values consumed */ | |||
| addq $8, %rax /* %rax is the SIZE-scaled index into AO: eight values consumed */ | |||
| .endm | |||
| @@ -437,13 +405,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_SUB /* one inner step: one xmm load from AO, four broadcasts from BO, VFMADDPS_R/_I update xmm8-xmm11. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 | |||
| VFMADDPS_R %xmm10,%xmm6,%xmm0 | |||
| VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) | |||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 | |||
| VFMADDPS_I %xmm11,%xmm7,%xmm0 | |||
| VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) | |||
| addq $4, BI /* BI is the SIZE-scaled index into BO: four values consumed */ | |||
| addq $4, %rax /* %rax is the SIZE-scaled index into AO: four values consumed */ | |||
| .endm | |||
| @@ -509,13 +477,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL1x2_SUB /* one inner step: one 64-bit vmovsd load from AO, four broadcasts from BO, VFMADDPS_R/_I update xmm8-xmm11. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 | |||
| VFMADDPS_R %xmm10,%xmm6,%xmm0 | |||
| VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) | |||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 | |||
| VFMADDPS_I %xmm11,%xmm7,%xmm0 | |||
| VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) | |||
| addq $4, BI /* BI is the SIZE-scaled index into BO: four values consumed */ | |||
| addq $2, %rax /* %rax is the SIZE-scaled index into AO: two values consumed */ | |||
| .endm | |||
| @@ -583,11 +551,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 | |||
| VFMADDPS_R %ymm8,%ymm4,%ymm0 | |||
| VFMADDPS_R %ymm12,%ymm4,%ymm1 | |||
| VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) | |||
| VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 | |||
| VFMADDPS_I %ymm9,%ymm5,%ymm0 | |||
| VFMADDPS_I %ymm13,%ymm5,%ymm1 | |||
| VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) | |||
| VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) | |||
| addq $2 , BI | |||
| addq $16, %rax | |||
| .endm | |||
| @@ -654,12 +622,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x1_SUB /* one inner step: two xmm loads from AO, two broadcasts from BO, VFMADDPS_R/_I update xmm8, xmm12, xmm9, xmm13. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| VFMADDPS_R %xmm12,%xmm4,%xmm1 | |||
| VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I %xmm13,%xmm5,%xmm1 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) | |||
| addq $2, BI /* BI is the SIZE-scaled index into BO: two values consumed */ | |||
| addq $8, %rax /* %rax is the SIZE-scaled index into AO: eight values consumed */ | |||
| .endm | |||
| @@ -723,9 +691,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x1_SUB /* one inner step: one xmm load from AO, two broadcasts from BO, VFMADDPS_R/_I update xmm8 and xmm9. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| addq $2, BI /* BI is the SIZE-scaled index into BO: two values consumed */ | |||
| addq $4, %rax /* %rax is the SIZE-scaled index into AO: four values consumed */ | |||
| .endm | |||
| @@ -778,9 +746,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL1x1_SUB /* one inner step: one 64-bit vmovsd load from AO, two broadcasts from BO, VFMADDPS_R/_I update xmm8 and xmm9. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPS_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPS_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) | |||
| addq $2, BI /* BI is the SIZE-scaled index into BO: two values consumed */ | |||
| addq $2, %rax /* %rax is the SIZE-scaled index into AO: two values consumed */ | |||
| .endm | |||
| @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| /********************************************************************* | |||
| * 2013/10/28 Saar | |||
| * 2013/11/13 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| @@ -131,23 +131,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(BULLDOZER) /* BULLDOZER builds use the four-operand vfmaddps/vfmaddss spelling; first argument is the accumulator (y0 += y1*y2). NOTE(review): each helper is given both as an assembler .macro and as a cpp #define - looks like interleaved before/after diff rows; confirm */ | |||
| .macro VFMADD231PS_ y0,y1,y2 | |||
| vfmaddps \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 | |||
| .macro VFMADD231SS_ x0,x1,x2 | |||
| vfmaddss \x0,\x1,\x2,\x0 | |||
| .endm | |||
| #define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 | |||
| #else /* every other build uses the three-operand vfmadd231ps/vfmadd231ss spelling with the same accumulate-into-first-argument meaning */ | |||
| .macro VFMADD231PS_ y0,y1,y2 | |||
| vfmadd231ps \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 | |||
| .macro VFMADD231SS_ x0,x1,x2 | |||
| vfmadd231ss \x1,\x2,\x0 | |||
| .endm | |||
| #define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 | |||
| #endif | |||
| @@ -164,16 +156,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm5,%ymm2,%ymm1 | |||
| VFMADD231PS_ %ymm6,%ymm3,%ymm0 | |||
| VFMADD231PS_ %ymm7,%ymm3,%ymm1 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | |||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | |||
| VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) | |||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm8,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm9,%ymm2,%ymm1 | |||
| VFMADD231PS_ %ymm10,%ymm3,%ymm0 | |||
| VFMADD231PS_ %ymm11,%ymm3,%ymm1 | |||
| VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) | |||
| VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | |||
| VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) | |||
| addq $4 , BI | |||
| addq $16, %rax | |||
| .endm | |||
| @@ -235,12 +227,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm6,%ymm3,%ymm0 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | |||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm8,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm10,%ymm3,%ymm0 | |||
| VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | |||
| addq $4 , BI | |||
| addq $8 , %rax | |||
| .endm | |||
| @@ -279,12 +271,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231PS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231PS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) | |||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231PS_ %xmm8,%xmm2,%xmm0 | |||
| VFMADD231PS_ %xmm10,%xmm3,%xmm0 | |||
| VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) | |||
| VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) | |||
| addq $4 , BI | |||
| addq $4 , %rax | |||
| .endm | |||
| @@ -323,16 +315,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm5,%xmm2,%xmm1 | |||
| VFMADD231SS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231SS_ %xmm7,%xmm3,%xmm1 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | |||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | |||
| VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) | |||
| vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm8,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm9,%xmm2,%xmm1 | |||
| VFMADD231SS_ %xmm10,%xmm3,%xmm0 | |||
| VFMADD231SS_ %xmm11,%xmm3,%xmm1 | |||
| VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) | |||
| VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | |||
| VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) | |||
| addq $4 , BI | |||
| addq $2, %rax | |||
| .endm | |||
| @@ -388,12 +380,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | |||
| vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm8,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm10,%xmm3,%xmm0 | |||
| VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | |||
| addq $4 , BI | |||
| addq $1, %rax | |||
| .endm | |||
| @@ -436,10 +428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm5,%ymm2,%ymm1 | |||
| VFMADD231PS_ %ymm6,%ymm3,%ymm0 | |||
| VFMADD231PS_ %ymm7,%ymm3,%ymm1 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | |||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | |||
| VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) | |||
| addq $2 , BI | |||
| addq $16, %rax | |||
| .endm | |||
| @@ -480,8 +472,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm6,%ymm3,%ymm0 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | |||
| addq $2 , BI | |||
| addq $8 , %rax | |||
| .endm | |||
| @@ -513,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231PS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231PS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) | |||
| addq $2 , BI | |||
| addq $4 , %rax | |||
| .endm | |||
| @@ -546,10 +538,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm5,%xmm2,%xmm1 | |||
| VFMADD231SS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231SS_ %xmm7,%xmm3,%xmm1 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | |||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | |||
| VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) | |||
| addq $2 , BI | |||
| addq $2, %rax | |||
| .endm | |||
| @@ -589,8 +581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm6,%xmm3,%xmm0 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | |||
| addq $2 , BI | |||
| addq $1, %rax | |||
| .endm | |||
| @@ -625,8 +617,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_ %ymm5,%ymm2,%ymm1 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | |||
| addq $1 , BI | |||
| addq $16, %rax | |||
| .endm | |||
| @@ -656,7 +648,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL8x1_SUB /* one inner step: one ymm load from AO, one broadcast from BO, VFMADD231PS_ updates ymm4. NOTE(review): the FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | |||
| VFMADD231PS_ %ymm4,%ymm2,%ymm0 | |||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | |||
| addq $1 , BI /* BI is the SIZE-scaled index into BO: one value consumed */ | |||
| addq $8 , %rax /* %rax is the SIZE-scaled index into AO: eight values consumed */ | |||
| .endm | |||
| @@ -684,7 +676,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x1_SUB /* one inner step: one xmm load from AO, one broadcast from BO, VFMADD231PS_ updates xmm4. NOTE(review): the FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| VFMADD231PS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | |||
| addq $1 , BI /* BI is the SIZE-scaled index into BO: one value consumed */ | |||
| addq $4 , %rax /* %rax is the SIZE-scaled index into AO: four values consumed */ | |||
| .endm | |||
| @@ -712,8 +704,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_ %xmm5,%xmm2,%xmm1 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | |||
| addq $1 , BI | |||
| addq $2, %rax | |||
| .endm | |||
| @@ -743,7 +735,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL1x1_SUB /* one inner step: one scalar vmovss load from AO and one from BO, VFMADD231SS_ updates xmm4. NOTE(review): the FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | |||
| VFMADD231SS_ %xmm4,%xmm2,%xmm0 | |||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | |||
| addq $1 , BI /* BI is the SIZE-scaled index into BO: one value consumed */ | |||
| addq $1, %rax /* %rax is the SIZE-scaled index into AO: one value consumed */ | |||
| .endm | |||
| @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| /******************************************************************************** | |||
| * 2013/10/28 Saar | |||
| * 2013/11/13 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| @@ -137,43 +137,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* four-operand double-precision FMA helpers, y0 is the accumulator. Here R and I both add: y0 += y1*y2. NOTE(review): each helper is given both as an assembler .macro and as a cpp #define - looks like interleaved before/after diff rows; confirm */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* R subtracts (y0 -= y1*y2), I adds */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfnmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* R adds, I subtracts */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfnmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 | |||
| #else /* none of the variants above: R and I both subtract */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfnmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfnmaddpd \y0,\y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 | |||
| #endif | |||
| @@ -181,43 +165,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* three-operand (231-form) double-precision FMA helpers, y0 is the accumulator. Here R and I both add: y0 += y1*y2. NOTE(review): each helper is given both as an assembler .macro and as a cpp #define - looks like interleaved before/after diff rows; confirm */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* R subtracts (y0 -= y1*y2), I adds */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfnmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* R adds, I subtracts */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfnmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 | |||
| #else /* none of the variants above: R and I both subtract */ | |||
| .macro VFMADDPD_R y0,y1,y2 | |||
| vfnmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 | |||
| .macro VFMADDPD_I y0,y1,y2 | |||
| vfnmadd231pd \y1,\y2,\y0 | |||
| .endm | |||
| #define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 | |||
| #endif | |||
| @@ -233,16 +201,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 | |||
| vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 | |||
| VFMADDPD_R %ymm8 ,%ymm4,%ymm0 | |||
| VFMADDPD_R %ymm12,%ymm4,%ymm1 | |||
| VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) | |||
| VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) | |||
| vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 | |||
| VFMADDPD_I %ymm9 ,%ymm5,%ymm0 | |||
| VFMADDPD_I %ymm13,%ymm5,%ymm1 | |||
| VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) | |||
| VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) | |||
| vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 | |||
| VFMADDPD_R %ymm10,%ymm6,%ymm0 | |||
| VFMADDPD_R %ymm14,%ymm6,%ymm1 | |||
| VFMADDPD_I %ymm11,%ymm7,%ymm0 | |||
| VFMADDPD_I %ymm15,%ymm7,%ymm1 | |||
| VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) | |||
| VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) | |||
| VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) | |||
| VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) | |||
| addq $4, BI | |||
| addq $8, %rax | |||
| @@ -337,17 +305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 | |||
| vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| VFMADDPD_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPD_R %xmm12,%xmm4,%xmm1 | |||
| VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | |||
| VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) | |||
| vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPD_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPD_I %xmm13,%xmm5,%xmm1 | |||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | |||
| VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) | |||
| vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 | |||
| VFMADDPD_R %xmm10,%xmm6,%xmm0 | |||
| VFMADDPD_R %xmm14,%xmm6,%xmm1 | |||
| VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) | |||
| VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) | |||
| vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | |||
| VFMADDPD_I %xmm11,%xmm7,%xmm0 | |||
| VFMADDPD_I %xmm15,%xmm7,%xmm1 | |||
| VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | |||
| VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) | |||
| addq $4, BI | |||
| addq $4, %rax | |||
| .endm | |||
| @@ -441,12 +409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 | |||
| vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPD_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPD_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | |||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | |||
| vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 | |||
| vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | |||
| VFMADDPD_R %xmm10,%xmm6,%xmm0 | |||
| VFMADDPD_I %xmm11,%xmm7,%xmm0 | |||
| VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) | |||
| VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | |||
| addq $4, BI | |||
| addq $2, %rax | |||
| .endm | |||
| @@ -513,10 +481,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 | |||
| vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 | |||
| vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 | |||
| VFMADDPD_R %ymm8 ,%ymm4,%ymm0 | |||
| VFMADDPD_R %ymm12,%ymm4,%ymm1 | |||
| VFMADDPD_I %ymm9 ,%ymm5,%ymm0 | |||
| VFMADDPD_I %ymm13,%ymm5,%ymm1 | |||
| VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) | |||
| VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) | |||
| VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) | |||
| VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) | |||
| addq $2, BI | |||
| addq $8, %rax | |||
| @@ -585,12 +553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x1_SUB /* one inner step: two xmm loads from AO, two vmovddup loads from BO, VFMADDPD_R/_I update xmm8, xmm12, xmm9, xmm13. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPD_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | |||
| vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 | |||
| VFMADDPD_R %xmm12,%xmm4,%xmm1 | |||
| VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) | |||
| vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPD_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPD_I %xmm13,%xmm5,%xmm1 | |||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | |||
| VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) | |||
| addq $2, BI /* BI is the SIZE-scaled index into BO: two values consumed */ | |||
| addq $4, %rax /* %rax is the SIZE-scaled index into AO: four values consumed */ | |||
| .endm | |||
| @@ -655,9 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL1x1_SUB /* one inner step: one xmm load from AO, two vmovddup loads from BO, VFMADDPD_R/_I update xmm8 and xmm9. NOTE(review): each FMA is listed in both the bare and the parenthesized call form - looks like interleaved before/after diff rows; confirm */ | |||
| vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 | |||
| vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 | |||
| VFMADDPD_R %xmm8,%xmm4,%xmm0 | |||
| VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | |||
| vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | |||
| VFMADDPD_I %xmm9,%xmm5,%xmm0 | |||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | |||
| addq $2, BI /* BI is the SIZE-scaled index into BO: two values consumed */ | |||
| addq $2, %rax /* %rax is the SIZE-scaled index into AO: two values consumed */ | |||
| .endm | |||