From d1d69e1b9ac20866a10170e49c3de2bdae8676d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:09:24 +0200 Subject: [PATCH 01/20] Add read barrier definition --- common_alpha.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_alpha.h b/common_alpha.h index 9739c941d..f1ea8ff94 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -43,6 +43,7 @@ #define MB asm("mb") #define WMB asm("wmb") +#define RMB asm("rmb") static void __inline blas_lock(unsigned long *address){ #ifndef __DECC From 8692456226b084333c8708b2887de22435cf3166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:10:37 +0200 Subject: [PATCH 02/20] Add read barrier definition --- common_arm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_arm.h b/common_arm.h index 8411e6dd6..ee691ad75 100644 --- a/common_arm.h +++ b/common_arm.h @@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB #define WMB +#define RMB #else #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") +#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #endif From d237dc13601743dc9cb584d60a02ccdc797df3cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:11:58 +0200 Subject: [PATCH 03/20] Add read barrier definition --- common_arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_arm64.h b/common_arm64.h index 99e0cee57..314946282 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") - +#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #define INLINE inline From 25e879fe92d598e9535e48cba18ec65c1e7d5211 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:12:54 +0200 Subject: [PATCH 04/20] Add (empty) read barrier definition --- common_ia64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_ia64.h b/common_ia64.h index 72b75fc4e..59aefbd6d 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -47,6 +47,7 @@ #define MB #define WMB +#define RMB #ifdef __ECC #include From ee6b3df02ca8594271417fc63029a898ec86feb7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:14:06 +0200 Subject: [PATCH 05/20] Add read barrier definition --- common_mips.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_mips.h b/common_mips.h index 35bff5083..2cc923043 100644 --- a/common_mips.h +++ b/common_mips.h @@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __sync_synchronize() #define WMB __sync_synchronize() +#define RMB __sync_synchronize() #define INLINE inline From 99dde1d2c9629324ceabb6e744d0f4845089e24f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:14:58 +0200 Subject: [PATCH 06/20] Add read barrier definition --- common_mips64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_mips64.h b/common_mips64.h index 1163413dc..af638d60c 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __sync_synchronize() #define WMB __sync_synchronize() +#define RMB __sync_synchronize() #define INLINE inline From 3d4db4d002afbd8ee970a5de840e075ccbae626a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:16:44 +0200 Subject: [PATCH 07/20] Add read barrier definition --- common_power.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_power.h b/common_power.h index e7caf9adf..e29d0f382 100644 --- a/common_power.h +++ b/common_power.h @@ -71,9 +71,11 @@ #if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") +#define RMB __asm__ __volatile__ ("eieio":::"memory") #else #define MB __asm__ __volatile__ ("sync") #define WMB __asm__ __volatile__ ("sync") +#define RMB __asm__ __volatile__ ("sync") #endif #define INLINE inline From 69b6e258d8d6fe6211a86b987204c350f4f62deb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:17:41 +0200 Subject: [PATCH 08/20] Add (empty) read barrier definition --- common_sparc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_sparc.h b/common_sparc.h index f99972db9..85e29fffa 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -41,6 +41,7 @@ #define MB __asm__ __volatile__ ("nop") #define WMB __asm__ __volatile__ ("nop") +#define RMB __asm__ __volatile__ ("nop") #ifndef ASSEMBLER From db3226a64681173d9d785cd71153a110b2b2dcee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:18:48 +0200 Subject: [PATCH 09/20] Add (empty) read barrier definition --- common_x86.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86.h b/common_x86.h index 99adc9f5b..ec928e236 100644 --- a/common_x86.h +++ b/common_x86.h @@ -47,6 +47,7 @@ #define MB #define WMB +#define RMB #ifdef C_SUN #define __asm__ __asm From a52bdd9d7b1b3e24d1eff9e52020c05cef6602dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:22:35 +0200 Subject: [PATCH 10/20] Add (empty) read barrier definition --- common_x86_64.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 958e9caed..0247674cd 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -63,13 +63,16 @@ #ifdef __GNUC__ #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB #else #define MB do {} while (0) #define WMB do {} while (0) +#define RMB #endif static void __inline blas_lock(volatile BLASULONG *address){ + #ifndef C_MSVC int ret; #else From f5efecb7caf9bd438eeeb3b53ebf96f9d8c38b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:24:10 +0200 Subject: [PATCH 11/20] Add (empty) read barrier definition --- common_zarch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common_zarch.h b/common_zarch.h index b5503a7a4..442bae821 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define COMMON_ZARCH #define MB -//__asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB -//__asm__ __volatile__ ("dmb ishst" : : : "memory") +#define RMB #define INLINE inline From f41600e66fef4481ab82fbbad89144a8a8cc0599 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:34:02 +0200 Subject: [PATCH 12/20] Add a read barrier in the traversing of the buffer list Needed on systems with weak memory ordering - the inferior, partially working fix from #2544 was already removed in #2551 --- driver/others/memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index a49fb1fa1..5abcbf3a4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){ LOCK_COMMAND(&alloc_lock); #endif do { + RMB; #if defined(USE_OPENMP) if (!memory[position].used) { blas_lock(&memory[position].lock); From 5b0093b5fe21dbdea04e37a6b3f687282b7313fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 14:58:52 +0200 Subject: [PATCH 13/20] Convert aligned moves to unaligned should have no performance impact on reasonably modern cpus and fixes occasional crashes in actual user code. --- kernel/x86_64/copy_sse2.S | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S index 200daafd9..a5ab2ea91 100644 --- a/kernel/x86_64/copy_sse2.S +++ b/kernel/x86_64/copy_sse2.S @@ -54,7 +54,7 @@ #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else -#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#define LOAD(OFFSET, ADDR, REG) movups OFFSET(ADDR), REG #endif PROLOGUE @@ -104,14 +104,14 @@ sarq $4, %rax jle .L13 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 - movaps -8 * SIZE(X), %xmm4 - movaps -6 * SIZE(X), %xmm5 - movaps -4 * SIZE(X), %xmm6 - movaps -2 * SIZE(X), %xmm7 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 + movups -8 * SIZE(X), %xmm4 + movups -6 * SIZE(X), %xmm5 + movups -4 * SIZE(X), %xmm6 + movups -2 * SIZE(X), %xmm7 decq %rax jle .L12 @@ -122,36 +122,36 @@ PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) - movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, Y @@ -161,14 +161,14 @@ ALIGN_3 .L12: - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, -12 * SIZE(Y) - movaps %xmm3, -10 * SIZE(Y) - movaps %xmm4, -8 * SIZE(Y) - movaps %xmm5, -6 * SIZE(Y) - movaps %xmm6, -4 * SIZE(Y) - movaps %xmm7, -2 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X @@ -179,15 +179,15 @@ jle .L14 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, -12 * SIZE(Y) - movaps %xmm3, -10 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y @@ -198,11 +198,11 @@ jle .L15 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y @@ -213,8 +213,8 @@ jle .L16 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups -16 * SIZE(X), %xmm0 + movups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y @@ -246,13 +246,13 @@ sarq $4, %rax jle .L23 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 - movaps -11 * SIZE(X), %xmm3 - movaps -9 * SIZE(X), %xmm4 - movaps -7 * SIZE(X), %xmm5 - movaps -5 * SIZE(X), %xmm6 - movaps -3 * SIZE(X), %xmm7 + movups -15 * SIZE(X), %xmm1 + movups -13 * SIZE(X), %xmm2 + movups -11 * SIZE(X), %xmm3 + movups -9 * SIZE(X), %xmm4 + movups -7 * SIZE(X), %xmm5 + movups -5 * SIZE(X), %xmm6 + movups -3 * SIZE(X), %xmm7 decq %rax jle .L22 @@ -264,11 +264,11 @@ #endif SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH @@ -276,11 +276,11 @@ #endif SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) @@ -288,11 +288,11 @@ #endif SHUFPD_1 %xmm5, %xmm4 - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) @@ -300,11 +300,11 @@ #endif SHUFPD_1 %xmm7, %xmm6 - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 - movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subq $-16 * SIZE, X @@ -315,26 +315,26 @@ .L22: SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 - movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y @@ -345,24 +345,24 @@ jle .L24 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 - movaps -11 * SIZE(X), %xmm3 - movaps -9 * SIZE(X), %xmm8 + movups -15 * SIZE(X), %xmm1 + movups -13 * SIZE(X), %xmm2 + movups -11 * SIZE(X), %xmm3 + movups -9 * SIZE(X), %xmm8 SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm8, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) - movaps %xmm8, %xmm0 + movups %xmm8, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y @@ -373,15 +373,15 @@ jle .L25 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 + movups -15 * SIZE(X), %xmm1 + movups -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, %xmm0 + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y @@ -392,10 +392,10 @@ jle .L26 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 + movups -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y @@ -424,14 +424,14 @@ sarq $4, %rax jle .L23 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 - movaps -8 * SIZE(X), %xmm4 - movaps -6 * SIZE(X), %xmm5 - movaps -4 * SIZE(X), %xmm6 - movaps -2 * SIZE(X), %xmm7 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 + movups -8 * SIZE(X), %xmm4 + movups -6 * SIZE(X), %xmm5 + movups -4 * SIZE(X), %xmm6 + movups -2 * SIZE(X), %xmm7 decq %rax jle .L22 @@ -515,16 +515,16 @@ jle .L24 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) - movaps -14 * SIZE(X), %xmm1 + movups -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) - movaps -12 * SIZE(X), %xmm2 + movups -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) - movaps -10 * SIZE(X), %xmm3 + movups -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) @@ -537,10 +537,10 @@ jle .L25 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) - movaps -14 * SIZE(X), %xmm1 + movups -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) @@ -553,7 +553,7 @@ jle .L26 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) From 3eec7d382c72a47df6ff687a7994f6f04b0c064d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 15:56:31 +0200 Subject: [PATCH 14/20] ARMV7 does not support DMB ISHLD, use DMB ISH --- common_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_arm.h b/common_arm.h index ee691ad75..682315de5 100644 --- a/common_arm.h +++ b/common_arm.h @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") -#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") +#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory") #endif From 0f08f3efa62558e104ae3e71e0470c5cd286a1d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 22:46:12 +0200 Subject: [PATCH 15/20] Add a multithread test for x86_64 --- .drone.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.drone.yml b/.drone.yml index 3bbd8fc88..300cf3254 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,3 +166,27 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: epyc_native_test + +platform: + os: linux + arch: amd64 + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester From b969533703cc745f04e4fc99e7e80d181e7f24f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Apr 2020 10:53:28 +0200 Subject: [PATCH 16/20] Add drone.io badge, mention EMAG8180 support, reformat the DYNAMIC_ARCH paragraph --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 04f43f4c7..61393bd8f 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/) + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. @@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ThunderX**: Optimized some Level-1 functions - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 - **TSV110**: Optimized some Level-3 helper functions +- **EMAG 8180**: preliminary support based on A57 #### PPC/PPC64 @@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake. + For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default. + DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. + On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. + For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. + The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. From 84a9614345d0030275230083fe4bc38e4531652d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Apr 2020 19:18:35 +0200 Subject: [PATCH 17/20] try x86_64 test without openmp --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 300cf3254..5686c5e41 100644 --- a/.drone.yml +++ b/.drone.yml @@ -179,7 +179,7 @@ steps: image: ubuntu:19.04 environment: CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1' + COMMON_FLAGS: 'USE_THREAD=1' commands: - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y From 579811fb6ae33e9b82b970300e1a1481985b6105 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Apr 2020 17:38:33 +0200 Subject: [PATCH 18/20] Move all 19.04-based jobs back to ubuntu 18.04 --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 5686c5e41..8b7ac3011 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,7 +8,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' @@ -32,7 +32,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' @@ -152,7 +152,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'USE_OPENMP=1' @@ -176,7 +176,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'USE_THREAD=1' From e8e8a6e60802596d1d9a037062ac40f4b1cad356 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Apr 2020 19:26:12 +0200 Subject: [PATCH 19/20] Restore USE_OPENMP in the x86 thread test --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 8b7ac3011..b1c211d14 100644 --- a/.drone.yml +++ b/.drone.yml @@ -179,7 +179,7 @@ steps: image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'USE_THREAD=1' + COMMON_FLAGS: 'USE_OPENMP=1' commands: - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y From 6b7ef6543a998ea9f1063873d04469503be38766 Mon Sep 17 00:00:00 2001 From: l00536773 Date: Thu, 16 Apr 2020 10:55:10 +0800 Subject: [PATCH 20/20] [OpenBLAS]: benchmark error of potrf [description]: when the matrix size goes higher than 5800 during the cpotrf test, error info, such as "Potrf info = 5679", will be returned on ARM64 and x86 machines. Uplo = L & F. [solution]: changed the func for building the matrix so that the complex Hermitian matrix can stay positive definite during the computation. [dts]: --- benchmark/potrf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 580e46072..cb4c23bab 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -193,14 +193,14 @@ int main(int argc, char *argv[]){ a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0; a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; }