Browse Source

Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop

tags/v0.2.9.rc2^2
Zhang Xianyi 12 years ago
parent
commit
dd2d3e61ab
7 changed files with 144 additions and 107 deletions
  1. +14
    -0
      driver/others/blas_server.c
  2. +5
    -0
      driver/others/blas_server_omp.c
  3. +5
    -0
      driver/others/blas_server_win32.c
  4. +5
    -1
      driver/others/memory.c
  5. +4
    -4
      kernel/arm/KERNEL.ARMV6
  6. +6
    -6
      kernel/arm/KERNEL.ARMV7
  7. +105
    -96
      kernel/x86_64/dgemv_n.S

+ 14
- 0
driver/others/blas_server.c View File

@@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define ATTRIBUTE_SIZE 128

extern void openblas_warning(int verbose, const char * msg);

/* This is a thread server model implementation. The threads are */
/* spawned at first access to blas library, and still remains until */
/* destruction routine is called. The number of threads are */
@@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0;
}

/*
 * Fork safety -- see https://github.com/xianyi/OpenBLAS/issues/294
 *
 * The BLAS thread server is shut down before fork() and re-initialised
 * afterwards in both the parent and the child.
 *
 * pthread_atfork() takes handlers of type void (*)(void), whereas
 * BLASFUNC(blas_thread_shutdown) returns int. Passing it directly is an
 * incompatible function-pointer conversion, so the adapters below provide
 * handlers of exactly the expected type and discard the return values.
 */

/* atfork "prepare" handler: stop the thread server before the fork. */
static void openblas_atfork_prepare(void)
{
  (void)BLASFUNC(blas_thread_shutdown)();
}

/* atfork "parent"/"child" handler: restart the thread server after the fork. */
static void openblas_atfork_restart(void)
{
  (void)blas_thread_init();
}

/*
 * Installs the fork handlers described above.
 *
 * Takes no arguments and returns nothing. If pthread_atfork() reports a
 * failure, a warning is emitted through openblas_warning() and execution
 * continues without the handlers.
 */
void openblas_fork_handler(void)
{
  int err;

  err = pthread_atfork(openblas_atfork_prepare,
                       openblas_atfork_restart,
                       openblas_atfork_restart);
  if (err != 0) {
    openblas_warning(0, "OpenBLAS cannot install fork handler. You may meet hang after fork.\n");
  }
}
#endif


+ 5
- 0
driver/others/blas_server_omp.c View File

@@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
return 0;
}

/*
 * Intentionally empty: this implementation registers no fork handlers.
 * NOTE(review): presumably kept so that callers can invoke
 * openblas_fork_handler() regardless of which thread-server file is built --
 * confirm against the call site.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * proper prototype rather than an old-style, unchecked declaration.
 */
void openblas_fork_handler(void)
{
  /* nothing to do */
}

#endif

+ 5
- 0
driver/others/blas_server_win32.c View File

@@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}

/*
 * Intentionally empty: this implementation registers no fork handlers.
 * NOTE(review): presumably present only so that the symbol exists for
 * callers shared with the other thread-server implementations -- confirm
 * against the call site.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * proper prototype rather than an old-style, unchecked declaration.
 */
void openblas_fork_handler(void)
{
  /* nothing to do */
}

+ 5
- 1
driver/others/memory.c View File

@@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
#ifdef SMP
if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
if (blas_server_avail == 0) blas_thread_init();
if (blas_server_avail == 0) {
blas_thread_init();
//deal with pthread and fork.
openblas_fork_handler();
}
#endif
#endif



+ 4
- 4
kernel/arm/KERNEL.ARMV6 View File

@@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S

SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c

SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S


+ 6
- 6
kernel/arm/KERNEL.ARMV7 View File

@@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S

SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c

SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S

SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal.c
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S

SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n.c
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S



+ 105
- 96
kernel/x86_64/dgemv_n.S View File

@@ -111,6 +111,9 @@
#define MM M
#endif

#define TMP_M %r15
#define Y2 %rbx
PROLOGUE
PROFCODE

@@ -170,8 +173,9 @@
jge .L00t

movq MMM,M
addq I,M
addq M, I
jle .L999x
movq I, M
.L00t:
movq XX,X
@@ -2463,21 +2467,23 @@
cmpq Y, BUFFER
je .L999
#endif

movq M, TMP_M
movq Y, Y1
cmpq $SIZE, INCY
jne .L950

testq $SIZE, Y
testq $SIZE, Y1
je .L910

movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)

addq $SIZE, Y
addq $SIZE, Y1
addq $SIZE, BUFFER

decq M
decq TMP_M
jle .L999
ALIGN_4

@@ -2485,20 +2491,20 @@
testq $SIZE, BUFFER
jne .L920

movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L914
ALIGN_3

.L912:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3

movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5
@@ -2514,12 +2520,12 @@
addpd %xmm6, %xmm2
addpd %xmm7, %xmm3

movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)

addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER

decq %rax
@@ -2527,14 +2533,14 @@
ALIGN_3

.L914:
testq $7, M
testq $7, TMP_M
jle .L999

testq $4, M
testq $4, TMP_M
jle .L915

movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1

movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5
@@ -2542,40 +2548,40 @@
addpd %xmm4, %xmm0
addpd %xmm5, %xmm1

movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)

addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER
ALIGN_3

.L915:
testq $2, M
testq $2, TMP_M
jle .L916

movapd (Y), %xmm0
movapd (Y1), %xmm0

movapd (BUFFER), %xmm4

addpd %xmm4, %xmm0

movapd %xmm0, (Y)
movapd %xmm0, (Y1)

addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER
ALIGN_3

.L916:
testq $1, M
testq $1, TMP_M
jle .L999

movsd (Y), %xmm0
movsd (Y1), %xmm0

movsd 0 * SIZE(BUFFER), %xmm4

addsd %xmm4, %xmm0

movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3

jmp .L999
@@ -2584,20 +2590,20 @@
.L920:
movapd -1 * SIZE(BUFFER), %xmm4

movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L924
ALIGN_3

.L922:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif

movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3

movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6
@@ -2618,14 +2624,14 @@
addpd %xmm6, %xmm2
addpd %xmm7, %xmm3

movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)

movapd %xmm8, %xmm4

addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER

decq %rax
@@ -2633,14 +2639,14 @@
ALIGN_3

.L924:
testq $7, M
testq $7, TMP_M
jle .L999

testq $4, M
testq $4, TMP_M
jle .L925

movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1

movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6
@@ -2651,20 +2657,20 @@
addpd %xmm4, %xmm0
addpd %xmm5, %xmm1

movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)

movapd %xmm6, %xmm4

addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER
ALIGN_3

.L925:
testq $2, M
testq $2, TMP_M
jle .L926

movapd (Y), %xmm0
movapd (Y1), %xmm0

movapd 1 * SIZE(BUFFER), %xmm5

@@ -2672,25 +2678,25 @@

addpd %xmm4, %xmm0

movapd %xmm0, (Y)
movapd %xmm0, (Y1)

movaps %xmm5, %xmm4

addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER
ALIGN_3

.L926:
testq $1, M
testq $1, TMP_M
jle .L999

movsd (Y), %xmm0
movsd (Y1), %xmm0

shufpd $1, %xmm4, %xmm4

addsd %xmm4, %xmm0

movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3

jmp .L999
@@ -2700,53 +2706,53 @@
testq $SIZE, BUFFER
je .L960

movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)

addq INCY, Y
addq INCY, Y1
addq $SIZE, BUFFER

decq M
decq TMP_M
jle .L999
ALIGN_4

.L960:
movq Y, Y1
movq Y1, Y2

movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L964
ALIGN_3

.L962:
movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2

movapd 0 * SIZE(BUFFER), %xmm4

movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2

movapd 2 * SIZE(BUFFER), %xmm5

movsd (Y), %xmm2
addq INCY, Y
movhpd (Y), %xmm2
addq INCY, Y
movsd (Y2), %xmm2
addq INCY, Y2
movhpd (Y2), %xmm2
addq INCY, Y2

movapd 4 * SIZE(BUFFER), %xmm6

addpd %xmm4, %xmm0

movsd (Y), %xmm3
addq INCY, Y
movhpd (Y), %xmm3
addq INCY, Y
movsd (Y2), %xmm3
addq INCY, Y2
movhpd (Y2), %xmm3
addq INCY, Y2

movapd 6 * SIZE(BUFFER), %xmm7

@@ -2781,23 +2787,23 @@
ALIGN_3

.L964:
testq $7, M
testq $7, TMP_M
jle .L999

testq $4, M
testq $4, TMP_M
jle .L965

movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2

movapd 0 * SIZE(BUFFER), %xmm4

movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2

movapd 2 * SIZE(BUFFER), %xmm5

@@ -2817,13 +2823,13 @@
ALIGN_3

.L965:
testq $2, M
testq $2, TMP_M
jle .L966

movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2

movapd 0 * SIZE(BUFFER), %xmm4

@@ -2838,10 +2844,10 @@
ALIGN_3

.L966:
testq $1, M
testq $1, TMP_M
jle .L999

movsd (Y), %xmm0
movsd (Y2), %xmm0

movsd 0 * SIZE(BUFFER), %xmm4

@@ -2853,6 +2859,9 @@
.L999:
leaq (, M, SIZE), %rax
addq %rax,AA
movq STACK_INCY, INCY
imulq INCY, %rax
addq %rax, Y
jmp .L0t
ALIGN_4



Loading…
Cancel
Save