Browse Source

Fixed an internal buffer overflow bug in (s/d/c/z)gemv on x86.

tags/v0.2.7
wangqian 13 years ago
parent
commit
6a72840945
8 changed files with 257 additions and 93 deletions
  1. +5
    -5
      kernel/x86/gemv_n_sse.S
  2. +3
    -3
      kernel/x86/gemv_n_sse2.S
  3. +9
    -19
      kernel/x86/gemv_t_sse.S
  4. +13
    -16
      kernel/x86/gemv_t_sse2.S
  5. +56
    -12
      kernel/x86/zgemv_n_sse.S
  6. +55
    -11
      kernel/x86/zgemv_n_sse2.S
  7. +58
    -13
      kernel/x86/zgemv_t_sse.S
  8. +58
    -14
      kernel/x86/zgemv_t_sse2.S

+ 5
- 5
kernel/x86/gemv_n_sse.S View File

@@ -101,10 +101,10 @@
#define Y 36 + STACKSIZE+ARGS(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp) #define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp) #define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp) #define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -153,8 +153,8 @@


movl YY,J movl YY,J
movl J,Y movl J,Y
movl STACK_LDA, LDA


movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX


@@ -688,9 +688,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 3
- 3
kernel/x86/gemv_n_sse2.S View File

@@ -714,9 +714,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 9
- 19
kernel/x86/gemv_t_sse.S View File

@@ -102,11 +102,9 @@
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)


#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -129,12 +127,8 @@


PROFCODE PROFCODE


movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X movl STACK_X, X
movl X,XX movl X,XX
movl N,J
movl J,NN # backup N
movl A,J movl A,J
movl J,AA # backup A movl J,AA # backup A
movl M,J movl M,J
@@ -144,7 +138,6 @@
addl $1,J addl $1,J
sall $22,J # J=2^22 elements; 2^22*sizeof(float)=buffer size(16MB) sall $22,J # J=2^22 elements; 2^22*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer. subl $8, J # Don't use last 8 float in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J subl J,MMM # MMM=MMM-J
movl J,M movl J,M
jge .L00t jge .L00t
@@ -159,13 +152,10 @@
movl AA,%eax movl AA,%eax
movl %eax,A # mov AA to A movl %eax,A # mov AA to A


movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl XX,X
movl XX,%eax
movl %eax,X


movl STACK_LDA, LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -688,9 +678,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 13
- 16
kernel/x86/gemv_t_sse2.S View File

@@ -76,7 +76,7 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define ARGS 20


#define M 4 + STACKSIZE+ARGS(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
@@ -89,10 +89,9 @@
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)


#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)


#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -117,10 +116,8 @@
PROFCODE PROFCODE




movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl STACK_X, X
movl X,XX
movl A,J movl A,J
movl J,AA # backup A movl J,AA # backup A
movl M,J movl M,J
@@ -130,7 +127,6 @@
addl $1,J addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer. subl $4, J # Don't use last 4 double in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J subl J,MMM # MMM=MMM-J
movl J,M movl J,M
jge .L00t jge .L00t
@@ -142,15 +138,13 @@
movl %eax,M movl %eax,M


.L00t: .L00t:
movl XX,%eax
movl %eax, X

movl AA,%eax movl AA,%eax
movl %eax,A # mov AA to A movl %eax,A # mov AA to A


movl NN,%eax
movl %eax,N # reset N


movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_LDA, LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -605,6 +599,9 @@
movl M,J movl M,J
leal (,J,SIZE),%eax leal (,J,SIZE),%eax
addl %eax,AA addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




+ 56
- 12
kernel/x86/zgemv_n_sse.S View File

@@ -89,18 +89,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -123,6 +128,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -130,6 +136,33 @@


PROFCODE PROFCODE


movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@@ -595,10 +628,21 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 55
- 11
kernel/x86/zgemv_n_sse2.S View File

@@ -76,18 +76,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)


#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -110,6 +115,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -117,6 +123,33 @@


PROFCODE PROFCODE


movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl YY,J
movl J,Y

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@@ -458,10 +491,21 @@
ALIGN_3 ALIGN_3


.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 58
- 13
kernel/x86/zgemv_t_sse.S View File

@@ -89,18 +89,23 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16

#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)

#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@@ -123,6 +128,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -130,8 +136,35 @@


PROFCODE PROFCODE


movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl X,XX
movl A,J
movl J,AA #backup A
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl $8,J
subl J,MMM #MMM-=J
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M

.L00t:
movl AA,%eax
movl %eax,A

movl XX,%eax
movl %eax,X

movl STACK_LDA,LDA
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -513,10 +546,22 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp

addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

+ 58
- 14
kernel/x86/zgemv_t_sse2.S View File

@@ -76,19 +76,24 @@
#endif #endif


#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 20

#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)

#define MMM 0 + ARGS(%esp)
#define AA 4 + ARGS(%esp)
#define XX 8 + ARGS(%esp)


#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx


@@ -110,6 +115,7 @@


PROLOGUE PROLOGUE


subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@@ -117,8 +123,35 @@


PROFCODE PROFCODE


movl STACK_X, X
movl X, XX
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl $4,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4

movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax, M

.L00t:
movl XX, %eax
movl %eax, X

movl AA,%eax
movl %eax,A

movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY


@@ -188,7 +221,7 @@
movl Y, Y1 movl Y, Y1


movl N, J movl N, J
ALIGN_3
ALIGN_4


.L11: .L11:
movl BUFFER, X movl BUFFER, X
@@ -395,10 +428,21 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4

.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret


EPILOGUE EPILOGUE

Loading…
Cancel
Save