Browse Source

Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop

tags/v0.2.9.rc2^2
Zhang Xianyi 12 years ago
parent
commit
dd2d3e61ab
7 changed files with 144 additions and 107 deletions
  1. +14
    -0
      driver/others/blas_server.c
  2. +5
    -0
      driver/others/blas_server_omp.c
  3. +5
    -0
      driver/others/blas_server_win32.c
  4. +5
    -1
      driver/others/memory.c
  5. +4
    -4
      kernel/arm/KERNEL.ARMV6
  6. +6
    -6
      kernel/arm/KERNEL.ARMV7
  7. +105
    -96
      kernel/x86_64/dgemv_n.S

+ 14
- 0
driver/others/blas_server.c View File

@@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#define ATTRIBUTE_SIZE 128 #define ATTRIBUTE_SIZE 128


extern void openblas_warning(int verbose, const char * msg);

/* This is a thread server model implementation. The threads are */ /* This is a thread server model implementation. The threads are */
/* spawned at first access to blas library, and still remains until */ /* spawned at first access to blas library, and still remains until */
/* destruction routine is called. The number of threads are */ /* destruction routine is called. The number of threads are */
@@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0; return 0;
} }


/*
  https://github.com/xianyi/OpenBLAS/issues/294
  Use pthread_atfork to shut down the BLAS thread server before fork(),
  then re-initialize it after fork() in both the parent and the child.
*/

/* pthread_atfork() expects callbacks of type void (*)(void), while the
   shutdown routine returns int. Thin adapters are registered instead of
   passing function pointers of an incompatible type; the return values
   are deliberately discarded because an atfork callback cannot report them. */
static void openblas_fork_prepare(void)
{
  (void)BLASFUNC(blas_thread_shutdown)();
}

static void openblas_fork_resume(void)
{
  (void)blas_thread_init();
}

/* Register the fork handlers. On failure only a warning is emitted;
   the caller is not notified. */
void openblas_fork_handler()
{
  int err;

  /* prepare = shut the server down; parent and child = start it again. */
  err = pthread_atfork(openblas_fork_prepare, openblas_fork_resume, openblas_fork_resume);
  if (err != 0) {
    openblas_warning(0, "OpenBLAS cannot install fork handler. You may meet hang after fork.\n");
  }
}
#endif #endif



+ 5
- 0
driver/others/blas_server_omp.c View File

@@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
return 0; return 0;
} }


/* Fork-handler hook for this thread-server variant: deliberately a no-op,
   nothing is registered here. */
void openblas_fork_handler()
{
  return;
}

#endif #endif

+ 5
- 0
driver/others/blas_server_win32.c View File

@@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
{ {
goto_set_num_threads(num); goto_set_num_threads(num);
} }

/* Fork-handler hook for this thread-server variant: deliberately a no-op,
   nothing is registered here. */
void openblas_fork_handler()
{
  return;
}

+ 5
- 1
driver/others/memory.c View File

@@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
#ifdef SMP #ifdef SMP
if (blas_cpu_number == 0) blas_get_cpu_number(); if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER #ifdef SMP_SERVER
if (blas_server_avail == 0) blas_thread_init();
if (blas_server_avail == 0) {
blas_thread_init();
//deal with pthread and fork.
openblas_fork_handler();
}
#endif #endif
#endif #endif




+ 4
- 4
kernel/arm/KERNEL.ARMV6 View File

@@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S


SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c


SDOTKERNEL = sdot_vfp.S SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S DDOTKERNEL = ddot_vfp.S


+ 6
- 6
kernel/arm/KERNEL.ARMV7 View File

@@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S


SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c


SDOTKERNEL = sdot_vfp.S SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S DDOTKERNEL = ddot_vfp.S
@@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S


SSCALKERNEL = scal_vfp.S SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal.c
CSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S


SGEMVNKERNEL = gemv_n_vfp.S SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n.c
CGEMVNKERNEL = cgemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S




+ 105
- 96
kernel/x86_64/dgemv_n.S View File

@@ -111,6 +111,9 @@
#define MM M #define MM M
#endif #endif


#define TMP_M %r15
#define Y2 %rbx
PROLOGUE PROLOGUE
PROFCODE PROFCODE


@@ -170,8 +173,9 @@
jge .L00t jge .L00t


movq MMM,M movq MMM,M
addq I,M
addq M, I
jle .L999x jle .L999x
movq I, M
.L00t: .L00t:
movq XX,X movq XX,X
@@ -2463,21 +2467,23 @@
cmpq Y, BUFFER cmpq Y, BUFFER
je .L999 je .L999
#endif #endif

movq M, TMP_M
movq Y, Y1
cmpq $SIZE, INCY cmpq $SIZE, INCY
jne .L950 jne .L950


testq $SIZE, Y
testq $SIZE, Y1
je .L910 je .L910


movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0 addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)


addq $SIZE, Y
addq $SIZE, Y1
addq $SIZE, BUFFER addq $SIZE, BUFFER


decq M
decq TMP_M
jle .L999 jle .L999
ALIGN_4 ALIGN_4


@@ -2485,20 +2491,20 @@
testq $SIZE, BUFFER testq $SIZE, BUFFER
jne .L920 jne .L920


movq M, %rax
movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L914 jle .L914
ALIGN_3 ALIGN_3


.L912: .L912:
#ifdef PREFETCHW #ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif #endif


movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3


movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
@@ -2514,12 +2520,12 @@
addpd %xmm6, %xmm2 addpd %xmm6, %xmm2
addpd %xmm7, %xmm3 addpd %xmm7, %xmm3


movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)


addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER addq $8 * SIZE, BUFFER


decq %rax decq %rax
@@ -2527,14 +2533,14 @@
ALIGN_3 ALIGN_3


.L914: .L914:
testq $7, M
testq $7, TMP_M
jle .L999 jle .L999


testq $4, M
testq $4, TMP_M
jle .L915 jle .L915


movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1


movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
@@ -2542,40 +2548,40 @@
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
addpd %xmm5, %xmm1 addpd %xmm5, %xmm1


movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)


addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER addq $4 * SIZE, BUFFER
ALIGN_3 ALIGN_3


.L915: .L915:
testq $2, M
testq $2, TMP_M
jle .L916 jle .L916


movapd (Y), %xmm0
movapd (Y1), %xmm0


movapd (BUFFER), %xmm4 movapd (BUFFER), %xmm4


addpd %xmm4, %xmm0 addpd %xmm4, %xmm0


movapd %xmm0, (Y)
movapd %xmm0, (Y1)


addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER addq $2 * SIZE, BUFFER
ALIGN_3 ALIGN_3


.L916: .L916:
testq $1, M
testq $1, TMP_M
jle .L999 jle .L999


movsd (Y), %xmm0
movsd (Y1), %xmm0


movsd 0 * SIZE(BUFFER), %xmm4 movsd 0 * SIZE(BUFFER), %xmm4


addsd %xmm4, %xmm0 addsd %xmm4, %xmm0


movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3 ALIGN_3


jmp .L999 jmp .L999
@@ -2584,20 +2590,20 @@
.L920: .L920:
movapd -1 * SIZE(BUFFER), %xmm4 movapd -1 * SIZE(BUFFER), %xmm4


movq M, %rax
movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L924 jle .L924
ALIGN_3 ALIGN_3


.L922: .L922:
#ifdef PREFETCHW #ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif #endif


movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3


movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6 movapd 3 * SIZE(BUFFER), %xmm6
@@ -2618,14 +2624,14 @@
addpd %xmm6, %xmm2 addpd %xmm6, %xmm2
addpd %xmm7, %xmm3 addpd %xmm7, %xmm3


movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)


movapd %xmm8, %xmm4 movapd %xmm8, %xmm4


addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER addq $8 * SIZE, BUFFER


decq %rax decq %rax
@@ -2633,14 +2639,14 @@
ALIGN_3 ALIGN_3


.L924: .L924:
testq $7, M
testq $7, TMP_M
jle .L999 jle .L999


testq $4, M
testq $4, TMP_M
jle .L925 jle .L925


movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1


movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6 movapd 3 * SIZE(BUFFER), %xmm6
@@ -2651,20 +2657,20 @@
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
addpd %xmm5, %xmm1 addpd %xmm5, %xmm1


movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)


movapd %xmm6, %xmm4 movapd %xmm6, %xmm4


addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER addq $4 * SIZE, BUFFER
ALIGN_3 ALIGN_3


.L925: .L925:
testq $2, M
testq $2, TMP_M
jle .L926 jle .L926


movapd (Y), %xmm0
movapd (Y1), %xmm0


movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5


@@ -2672,25 +2678,25 @@


addpd %xmm4, %xmm0 addpd %xmm4, %xmm0


movapd %xmm0, (Y)
movapd %xmm0, (Y1)


movaps %xmm5, %xmm4 movaps %xmm5, %xmm4


addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER addq $2 * SIZE, BUFFER
ALIGN_3 ALIGN_3


.L926: .L926:
testq $1, M
testq $1, TMP_M
jle .L999 jle .L999


movsd (Y), %xmm0
movsd (Y1), %xmm0


shufpd $1, %xmm4, %xmm4 shufpd $1, %xmm4, %xmm4


addsd %xmm4, %xmm0 addsd %xmm4, %xmm0


movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3 ALIGN_3


jmp .L999 jmp .L999
@@ -2700,53 +2706,53 @@
testq $SIZE, BUFFER testq $SIZE, BUFFER
je .L960 je .L960


movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0 addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)


addq INCY, Y
addq INCY, Y1
addq $SIZE, BUFFER addq $SIZE, BUFFER


decq M
decq TMP_M
jle .L999 jle .L999
ALIGN_4 ALIGN_4


.L960: .L960:
movq Y, Y1
movq Y1, Y2


movq M, %rax
movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L964 jle .L964
ALIGN_3 ALIGN_3


.L962: .L962:
movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2


movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4


movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2


movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5


movsd (Y), %xmm2
addq INCY, Y
movhpd (Y), %xmm2
addq INCY, Y
movsd (Y2), %xmm2
addq INCY, Y2
movhpd (Y2), %xmm2
addq INCY, Y2


movapd 4 * SIZE(BUFFER), %xmm6 movapd 4 * SIZE(BUFFER), %xmm6


addpd %xmm4, %xmm0 addpd %xmm4, %xmm0


movsd (Y), %xmm3
addq INCY, Y
movhpd (Y), %xmm3
addq INCY, Y
movsd (Y2), %xmm3
addq INCY, Y2
movhpd (Y2), %xmm3
addq INCY, Y2


movapd 6 * SIZE(BUFFER), %xmm7 movapd 6 * SIZE(BUFFER), %xmm7


@@ -2781,23 +2787,23 @@
ALIGN_3 ALIGN_3


.L964: .L964:
testq $7, M
testq $7, TMP_M
jle .L999 jle .L999


testq $4, M
testq $4, TMP_M
jle .L965 jle .L965


movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2


movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4


movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2


movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5


@@ -2817,13 +2823,13 @@
ALIGN_3 ALIGN_3


.L965: .L965:
testq $2, M
testq $2, TMP_M
jle .L966 jle .L966


movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2


movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4


@@ -2838,10 +2844,10 @@
ALIGN_3 ALIGN_3


.L966: .L966:
testq $1, M
testq $1, TMP_M
jle .L999 jle .L999


movsd (Y), %xmm0
movsd (Y2), %xmm0


movsd 0 * SIZE(BUFFER), %xmm4 movsd 0 * SIZE(BUFFER), %xmm4


@@ -2853,6 +2859,9 @@
.L999: .L999:
leaq (, M, SIZE), %rax leaq (, M, SIZE), %rax
addq %rax,AA addq %rax,AA
movq STACK_INCY, INCY
imulq INCY, %rax
addq %rax, Y
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4




Loading…
Cancel
Save