From d1d69e1b9ac20866a10170e49c3de2bdae8676d9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:09:24 +0200
Subject: [PATCH 01/20] Add read barrier definition

---
 common_alpha.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_alpha.h b/common_alpha.h
index 9739c941d..f1ea8ff94 100644
--- a/common_alpha.h
+++ b/common_alpha.h
@@ -43,6 +43,7 @@
 
 #define MB  asm("mb")
 #define WMB asm("wmb")
+#define RMB asm("mb")  /* Alpha has no "rmb" instruction (only mb/wmb); the full barrier also orders loads */
 
 static void __inline blas_lock(unsigned long *address){
 #ifndef __DECC

From 8692456226b084333c8708b2887de22435cf3166 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:10:37 +0200
Subject: [PATCH 02/20] Add read barrier definition

---
 common_arm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common_arm.h b/common_arm.h
index 8411e6dd6..ee691ad75 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define MB
 #define WMB
+#define RMB
 
 #else
 
 #define MB   __asm__ __volatile__ ("dmb  ish" : : : "memory")
 #define WMB  __asm__ __volatile__ ("dmb  ishst" : : : "memory")
+#define RMB  __asm__ __volatile__ ("dmb  ishld" : : : "memory")
 
 #endif
 

From d237dc13601743dc9cb584d60a02ccdc797df3cf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:11:58 +0200
Subject: [PATCH 03/20] Add read barrier definition

---
 common_arm64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_arm64.h b/common_arm64.h
index 99e0cee57..314946282 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define MB   __asm__ __volatile__ ("dmb  ish" : : : "memory")
 #define WMB  __asm__ __volatile__ ("dmb  ishst" : : : "memory")
-
+#define RMB  __asm__ __volatile__ ("dmb  ishld" : : : "memory")
 
 #define INLINE inline
 

From 25e879fe92d598e9535e48cba18ec65c1e7d5211 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:12:54 +0200
Subject: [PATCH 04/20] Add (empty) read barrier definition

---
 common_ia64.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_ia64.h b/common_ia64.h
index 72b75fc4e..59aefbd6d 100644
--- a/common_ia64.h
+++ b/common_ia64.h
@@ -47,6 +47,7 @@
 
 #define MB
 #define WMB
+#define RMB
 
 #ifdef __ECC
 #include <ia64intrin.h>

From ee6b3df02ca8594271417fc63029a898ec86feb7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:14:06 +0200
Subject: [PATCH 05/20] Add read barrier definition

---
 common_mips.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_mips.h b/common_mips.h
index 35bff5083..2cc923043 100644
--- a/common_mips.h
+++ b/common_mips.h
@@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define MB  __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
 
 #define INLINE inline
 

From 99dde1d2c9629324ceabb6e744d0f4845089e24f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:14:58 +0200
Subject: [PATCH 06/20] Add read barrier definition

---
 common_mips64.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_mips64.h b/common_mips64.h
index 1163413dc..af638d60c 100644
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define MB  __sync_synchronize()
 #define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
 
 #define INLINE inline
 

From 3d4db4d002afbd8ee970a5de840e075ccbae626a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:16:44 +0200
Subject: [PATCH 07/20] Add read barrier definition

---
 common_power.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common_power.h b/common_power.h
index e7caf9adf..e29d0f382 100644
--- a/common_power.h
+++ b/common_power.h
@@ -71,9 +71,11 @@
 #if defined(POWER8) || defined(POWER9)
 #define MB		__asm__ __volatile__ ("eieio":::"memory")
 #define WMB		__asm__ __volatile__ ("eieio":::"memory")
+#define RMB		__asm__ __volatile__ ("lwsync":::"memory")	/* eieio does not order loads from cacheable memory; lwsync orders load-load */
 #else
 #define MB		__asm__ __volatile__ ("sync")
 #define WMB		__asm__ __volatile__ ("sync")
+#define RMB		__asm__ __volatile__ ("sync":::"memory")	/* "memory" clobber keeps the compiler from moving loads across the barrier */
 #endif
 
 #define INLINE inline

From 69b6e258d8d6fe6211a86b987204c350f4f62deb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:17:41 +0200
Subject: [PATCH 08/20] Add (empty) read barrier definition

---
 common_sparc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_sparc.h b/common_sparc.h
index f99972db9..85e29fffa 100644
--- a/common_sparc.h
+++ b/common_sparc.h
@@ -41,6 +41,7 @@
 
 #define MB	__asm__ __volatile__ ("nop")
 #define WMB	__asm__ __volatile__ ("nop")
+#define RMB	__asm__ __volatile__ ("nop")
 
 #ifndef ASSEMBLER
 

From db3226a64681173d9d785cd71153a110b2b2dcee Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:18:48 +0200
Subject: [PATCH 09/20] Add (empty) read barrier definition

---
 common_x86.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common_x86.h b/common_x86.h
index 99adc9f5b..ec928e236 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -47,6 +47,7 @@
 
 #define MB
 #define WMB
+#define RMB
 
 #ifdef C_SUN
 #define	__asm__ __asm

From a52bdd9d7b1b3e24d1eff9e52020c05cef6602dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:22:35 +0200
Subject: [PATCH 10/20] Add (compiler-only) read barrier definition

---
 common_x86_64.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common_x86_64.h b/common_x86_64.h
index 958e9caed..0247674cd 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -63,13 +63,15 @@
 #ifdef __GNUC__
 #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
+#define RMB do { __asm__ __volatile__("": : :"memory"); } while (0) /* x86-64 does not reorder loads with other loads; only the compiler needs restraining */
 #else
 #define MB do {} while (0)
 #define WMB do {} while (0)
+#define RMB do {} while (0)
 #endif
 
 static void __inline blas_lock(volatile BLASULONG *address){
 
 #ifndef C_MSVC
   int ret;
 #else

From f5efecb7caf9bd438eeeb3b53ebf96f9d8c38b61 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:24:10 +0200
Subject: [PATCH 11/20] Add (empty) read barrier definition

---
 common_zarch.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/common_zarch.h b/common_zarch.h
index b5503a7a4..442bae821 100644
--- a/common_zarch.h
+++ b/common_zarch.h
@@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define COMMON_ZARCH
 
 #define MB   
-//__asm__ __volatile__ ("dmb  ish" : : : "memory")
 #define WMB  
-//__asm__ __volatile__ ("dmb  ishst" : : : "memory")
+#define RMB
 
 
 #define INLINE inline

From f41600e66fef4481ab82fbbad89144a8a8cc0599 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 12:34:02 +0200
Subject: [PATCH 12/20] Add a read barrier in the traversing of the buffer list

Needed on systems with weak memory ordering - the inferior, partially working fix from #2544 was already removed in #2551
---
 driver/others/memory.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index a49fb1fa1..5abcbf3a4 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){
   LOCK_COMMAND(&alloc_lock);
 #endif
   do {
+	  RMB;
 #if defined(USE_OPENMP)	  
     if (!memory[position].used) { 
       blas_lock(&memory[position].lock);

From 5b0093b5fe21dbdea04e37a6b3f687282b7313fb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 14:58:52 +0200
Subject: [PATCH 13/20] Convert aligned moves to unaligned

This should have no performance impact on reasonably modern CPUs, and it fixes occasional crashes in actual user code.
---
 kernel/x86_64/copy_sse2.S | 186 +++++++++++++++++++-------------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S
index 200daafd9..a5ab2ea91 100644
--- a/kernel/x86_64/copy_sse2.S
+++ b/kernel/x86_64/copy_sse2.S
@@ -54,7 +54,7 @@
 #ifdef OPTERON
 #define LOAD(OFFSET, ADDR, REG)		xorps	REG, REG; addpd	OFFSET(ADDR), REG
 #else
-#define LOAD(OFFSET, ADDR, REG)		movaps	OFFSET(ADDR), REG
+#define LOAD(OFFSET, ADDR, REG)		movups	OFFSET(ADDR), REG
 #endif
 
 	PROLOGUE
@@ -104,14 +104,14 @@
 	sarq	$4, %rax
 	jle	.L13
 
-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
-	movaps	 -8 * SIZE(X), %xmm4
-	movaps	 -6 * SIZE(X), %xmm5
-	movaps	 -4 * SIZE(X), %xmm6
-	movaps	 -2 * SIZE(X), %xmm7
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3
+	movups	 -8 * SIZE(X), %xmm4
+	movups	 -6 * SIZE(X), %xmm5
+	movups	 -4 * SIZE(X), %xmm6
+	movups	 -2 * SIZE(X), %xmm7
 
 	decq	%rax
 	jle .L12
@@ -122,36 +122,36 @@
 	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
 #endif
 
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
 	LOAD( 0 * SIZE, X, %xmm0)
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
 	LOAD( 2 * SIZE, X, %xmm1)
 
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
 #endif
 
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
 	LOAD( 4 * SIZE, X, %xmm2)
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
 	LOAD( 6 * SIZE, X, %xmm3)
 
 #if defined(PREFETCHW) && !defined(FETCH128)
 	PREFETCHW (PREFETCHSIZE +  64) - PREOFFSET(Y)
 #endif
 
-	movaps	%xmm4, -8 * SIZE(Y)
+	movups	%xmm4, -8 * SIZE(Y)
 	LOAD( 8 * SIZE, X, %xmm4)
-	movaps	%xmm5, -6 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)
 	LOAD(10 * SIZE, X, %xmm5)
 
 #if defined(PREFETCH) && !defined(FETCH128)
 	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
 #endif
 
-	movaps	%xmm6, -4 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)
 	LOAD(12 * SIZE, X, %xmm6)
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)
 	LOAD(14 * SIZE, X, %xmm7)
 
 	subq	$-16 * SIZE, Y
@@ -161,14 +161,14 @@
 	ALIGN_3
 
 .L12:
-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, -12 * SIZE(Y)
-	movaps	%xmm3, -10 * SIZE(Y)
-	movaps	%xmm4,  -8 * SIZE(Y)
-	movaps	%xmm5,  -6 * SIZE(Y)
-	movaps	%xmm6,  -4 * SIZE(Y)
-	movaps	%xmm7,  -2 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
+	movups	%xmm4,  -8 * SIZE(Y)
+	movups	%xmm5,  -6 * SIZE(Y)
+	movups	%xmm6,  -4 * SIZE(Y)
+	movups	%xmm7,  -2 * SIZE(Y)
 
 	subq	$-16 * SIZE, Y
 	subq	$-16 * SIZE, X
@@ -179,15 +179,15 @@
 	jle	.L14
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3
 
-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, -12 * SIZE(Y)
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
 
 	addq	$8 * SIZE, X
 	addq	$8 * SIZE, Y
@@ -198,11 +198,11 @@
 	jle	.L15
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
 
-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
 
 	addq	$4 * SIZE, X
 	addq	$4 * SIZE, Y
@@ -213,8 +213,8 @@
 	jle	.L16
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	-16 * SIZE(X), %xmm0
+	movups	%xmm0, -16 * SIZE(Y)
 
 	addq	$2 * SIZE, X
 	addq	$2 * SIZE, Y
@@ -246,13 +246,13 @@
 	sarq	$4, %rax
 	jle	.L23
 
-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
-	movaps	-11 * SIZE(X), %xmm3
-	movaps	 -9 * SIZE(X), %xmm4
-	movaps	 -7 * SIZE(X), %xmm5
-	movaps	 -5 * SIZE(X), %xmm6
-	movaps	 -3 * SIZE(X), %xmm7
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2
+	movups	-11 * SIZE(X), %xmm3
+	movups	 -9 * SIZE(X), %xmm4
+	movups	 -7 * SIZE(X), %xmm5
+	movups	 -5 * SIZE(X), %xmm6
+	movups	 -3 * SIZE(X), %xmm7
 
 	decq	%rax
 	jle .L22
@@ -264,11 +264,11 @@
 #endif
 
 	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
 	LOAD(-1 * SIZE, X, %xmm0)
 
 	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
 	LOAD( 1 * SIZE, X, %xmm1)
 
 #ifdef PREFETCH
@@ -276,11 +276,11 @@
 #endif
 
 	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
 	LOAD( 3 * SIZE, X, %xmm2)
 
 	SHUFPD_1 %xmm4, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
 	LOAD( 5 * SIZE, X, %xmm3)
 
 #if defined(PREFETCHW) && !defined(FETCH128)
@@ -288,11 +288,11 @@
 #endif
 
 	SHUFPD_1 %xmm5, %xmm4
-	movaps	%xmm4,  -8 * SIZE(Y)
+	movups	%xmm4,  -8 * SIZE(Y)
 	LOAD( 7 * SIZE, X, %xmm4)
 
 	SHUFPD_1 %xmm6, %xmm5
-	movaps	%xmm5, -6 * SIZE(Y)
+	movups	%xmm5, -6 * SIZE(Y)
 	LOAD( 9 * SIZE, X, %xmm5)
 
 #if defined(PREFETCH) && !defined(FETCH128)
@@ -300,11 +300,11 @@
 #endif
 
 	SHUFPD_1 %xmm7, %xmm6
-	movaps	%xmm6, -4 * SIZE(Y)
+	movups	%xmm6, -4 * SIZE(Y)
 	LOAD(11 * SIZE, X, %xmm6)
 
 	SHUFPD_1 %xmm0, %xmm7
-	movaps	%xmm7, -2 * SIZE(Y)
+	movups	%xmm7, -2 * SIZE(Y)
 	LOAD(13 * SIZE, X, %xmm7)
 
 	subq	$-16 * SIZE, X
@@ -315,26 +315,26 @@
 
 .L22:
 	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
 	LOAD(-1 * SIZE, X, %xmm0)
 
 	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
 
 	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
 	SHUFPD_1 %xmm4, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
 
 	SHUFPD_1 %xmm5, %xmm4
-	movaps	%xmm4,  -8 * SIZE(Y)
+	movups	%xmm4,  -8 * SIZE(Y)
 	SHUFPD_1 %xmm6, %xmm5
-	movaps	%xmm5,  -6 * SIZE(Y)
+	movups	%xmm5,  -6 * SIZE(Y)
 
 	SHUFPD_1 %xmm7, %xmm6
-	movaps	%xmm6,  -4 * SIZE(Y)
+	movups	%xmm6,  -4 * SIZE(Y)
 	SHUFPD_1 %xmm0, %xmm7
-	movaps	%xmm7,  -2 * SIZE(Y)
+	movups	%xmm7,  -2 * SIZE(Y)
 
 	subq	$-16 * SIZE, X
 	subq	$-16 * SIZE, Y
@@ -345,24 +345,24 @@
 	jle	.L24
 	ALIGN_3
 
-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
-	movaps	-11 * SIZE(X), %xmm3
-	movaps	 -9 * SIZE(X), %xmm8
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2
+	movups	-11 * SIZE(X), %xmm3
+	movups	 -9 * SIZE(X), %xmm8
 
 	SHUFPD_1 %xmm1, %xmm0
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
 
 	SHUFPD_1 %xmm2, %xmm1
-	movaps	%xmm1, -14 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
 
 	SHUFPD_1 %xmm3, %xmm2
-	movaps	%xmm2, -12 * SIZE(Y)
+	movups	%xmm2, -12 * SIZE(Y)
 
 	SHUFPD_1 %xmm8, %xmm3
-	movaps	%xmm3, -10 * SIZE(Y)
+	movups	%xmm3, -10 * SIZE(Y)
 
-	movaps	%xmm8, %xmm0
+	movups	%xmm8, %xmm0
 
 	addq	$8 * SIZE, X
 	addq	$8 * SIZE, Y
@@ -373,15 +373,15 @@
 	jle	.L25
 	ALIGN_3
 
-	movaps	-15 * SIZE(X), %xmm1
-	movaps	-13 * SIZE(X), %xmm2
+	movups	-15 * SIZE(X), %xmm1
+	movups	-13 * SIZE(X), %xmm2
 
 	SHUFPD_1 %xmm1, %xmm0
 	SHUFPD_1 %xmm2, %xmm1
 
-	movaps	%xmm0, -16 * SIZE(Y)
-	movaps	%xmm1, -14 * SIZE(Y)
-	movaps	%xmm2, %xmm0
+	movups	%xmm0, -16 * SIZE(Y)
+	movups	%xmm1, -14 * SIZE(Y)
+	movups	%xmm2, %xmm0
 
 	addq	$4 * SIZE, X
 	addq	$4 * SIZE, Y
@@ -392,10 +392,10 @@
 	jle	.L26
 	ALIGN_3
 
-	movaps	-15 * SIZE(X), %xmm1
+	movups	-15 * SIZE(X), %xmm1
 	SHUFPD_1 %xmm1, %xmm0
 
-	movaps	%xmm0, -16 * SIZE(Y)
+	movups	%xmm0, -16 * SIZE(Y)
 
 	addq	$2 * SIZE, X
 	addq	$2 * SIZE, Y
@@ -424,14 +424,14 @@
 	sarq	$4, %rax
 	jle	.L23
 
-	movaps	-16 * SIZE(X), %xmm0
-	movaps	-14 * SIZE(X), %xmm1
-	movaps	-12 * SIZE(X), %xmm2
-	movaps	-10 * SIZE(X), %xmm3
-	movaps	 -8 * SIZE(X), %xmm4
-	movaps	 -6 * SIZE(X), %xmm5
-	movaps	 -4 * SIZE(X), %xmm6
-	movaps	 -2 * SIZE(X), %xmm7
+	movups	-16 * SIZE(X), %xmm0
+	movups	-14 * SIZE(X), %xmm1
+	movups	-12 * SIZE(X), %xmm2
+	movups	-10 * SIZE(X), %xmm3
+	movups	 -8 * SIZE(X), %xmm4
+	movups	 -6 * SIZE(X), %xmm5
+	movups	 -4 * SIZE(X), %xmm6
+	movups	 -2 * SIZE(X), %xmm7
 
 	decq	%rax
 	jle .L22
@@ -515,16 +515,16 @@
 	jle	.L24
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
 	movlps	%xmm0, -16 * SIZE(Y)
 	movhps	%xmm0, -15 * SIZE(Y)
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-14 * SIZE(X), %xmm1
 	movlps	%xmm1, -14 * SIZE(Y)
 	movhps	%xmm1, -13 * SIZE(Y)
-	movaps	-12 * SIZE(X), %xmm2
+	movups	-12 * SIZE(X), %xmm2
 	movlps	%xmm2, -12 * SIZE(Y)
 	movhps	%xmm2, -11 * SIZE(Y)
-	movaps	-10 * SIZE(X), %xmm3
+	movups	-10 * SIZE(X), %xmm3
 	movlps	%xmm3, -10 * SIZE(Y)
 	movhps	%xmm3,  -9 * SIZE(Y)
 
@@ -537,10 +537,10 @@
 	jle	.L25
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
 	movlps	%xmm0, -16 * SIZE(Y)
 	movhps	%xmm0, -15 * SIZE(Y)
-	movaps	-14 * SIZE(X), %xmm1
+	movups	-14 * SIZE(X), %xmm1
 	movlps	%xmm1, -14 * SIZE(Y)
 	movhps	%xmm1, -13 * SIZE(Y)
 
@@ -553,7 +553,7 @@
 	jle	.L26
 	ALIGN_3
 
-	movaps	-16 * SIZE(X), %xmm0
+	movups	-16 * SIZE(X), %xmm0
 	movlps	%xmm0, -16 * SIZE(Y)
 	movhps	%xmm0, -15 * SIZE(Y)
 

From 3eec7d382c72a47df6ff687a7994f6f04b0c064d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 15:56:31 +0200
Subject: [PATCH 14/20] ARMV7 does not support DMB ISHLD, use DMB ISH

---
 common_arm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_arm.h b/common_arm.h
index ee691ad75..682315de5 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define MB   __asm__ __volatile__ ("dmb  ish" : : : "memory")
 #define WMB  __asm__ __volatile__ ("dmb  ishst" : : : "memory")
-#define RMB  __asm__ __volatile__ ("dmb  ishld" : : : "memory")
+#define RMB  __asm__ __volatile__ ("dmb  ish" : : : "memory")
 
 #endif
 

From 0f08f3efa62558e104ae3e71e0470c5cd286a1d2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 13 Apr 2020 22:46:12 +0200
Subject: [PATCH 15/20] Add a multithread test for x86_64

---
 .drone.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/.drone.yml b/.drone.yml
index 3bbd8fc88..300cf3254 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -166,3 +166,27 @@ steps:
     - make -C ctest $COMMON_FLAGS
     - make -C utest $COMMON_FLAGS
     - make -C cpp_thread_test dgemm_tester
+---
+kind: pipeline
+name: epyc_native_test
+
+platform:
+  os: linux
+  arch: amd64
+
+steps:
+- name: Build and Test
+  image: ubuntu:19.04
+  environment:
+    CC: gcc
+    COMMON_FLAGS: 'USE_OPENMP=1'
+  commands:
+    - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+    - apt-get update -y
+    - apt-get install -y make $CC gfortran perl python g++
+    - $CC --version
+    - make QUIET_MAKE=1 $COMMON_FLAGS
+    - make -C test $COMMON_FLAGS
+    - make -C ctest $COMMON_FLAGS
+    - make -C utest $COMMON_FLAGS
+    - make -C cpp_thread_test dgemm_tester

From b969533703cc745f04e4fc99e7e80d181e7f24f1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 14 Apr 2020 10:53:28 +0200
Subject: [PATCH 16/20] Add drone.io badge, mention EMAG8180 support, reformat
 the DYNAMIC_ARCH paragraph

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 04f43f4c7..61393bd8f 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
 
 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
 
+Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
+
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 
+
 ## Introduction
 
 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
@@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **ThunderX**: Optimized some Level-1 functions
 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
 - **TSV110**: Optimized some Level-3 helper functions
+- **EMAG 8180**: Preliminary support based on Cortex-A57
 
 #### PPC/PPC64
 
@@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
+
 For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
+
 DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
+
 On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+
 For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+
 The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.

From 84a9614345d0030275230083fe4bc38e4531652d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 14 Apr 2020 19:18:35 +0200
Subject: [PATCH 17/20] try x86_64 test without openmp

---
 .drone.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.drone.yml b/.drone.yml
index 300cf3254..5686c5e41 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -179,7 +179,7 @@ steps:
   image: ubuntu:19.04
   environment:
     CC: gcc
-    COMMON_FLAGS: 'USE_OPENMP=1'
+    COMMON_FLAGS: 'USE_THREAD=1'
   commands:
     - echo "MAKE_FLAGS:= $COMMON_FLAGS"
     - apt-get update -y

From 579811fb6ae33e9b82b970300e1a1481985b6105 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Apr 2020 17:38:33 +0200
Subject: [PATCH 18/20] Move all 19.04-based jobs back to ubuntu 18.04

---
 .drone.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index 5686c5e41..8b7ac3011 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -8,7 +8,7 @@ platform:
 
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
@@ -32,7 +32,7 @@ platform:
 
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
@@ -152,7 +152,7 @@ platform:
 
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'USE_OPENMP=1'
@@ -176,7 +176,7 @@ platform:
 
 steps:
 - name: Build and Test
-  image: ubuntu:19.04
+  image: ubuntu:18.04
   environment:
     CC: gcc
     COMMON_FLAGS: 'USE_THREAD=1'

From e8e8a6e60802596d1d9a037062ac40f4b1cad356 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Apr 2020 19:26:12 +0200
Subject: [PATCH 19/20] Restore USE_OPENMP in the x86 thread test

---
 .drone.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.drone.yml b/.drone.yml
index 8b7ac3011..b1c211d14 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -179,7 +179,7 @@ steps:
   image: ubuntu:18.04
   environment:
     CC: gcc
-    COMMON_FLAGS: 'USE_THREAD=1'
+    COMMON_FLAGS: 'USE_OPENMP=1'
   commands:
     - echo "MAKE_FLAGS:= $COMMON_FLAGS"
     - apt-get update -y

From 6b7ef6543a998ea9f1063873d04469503be38766 Mon Sep 17 00:00:00 2001
From: l00536773 <linzelong2@hisilicon.com>
Date: Thu, 16 Apr 2020 10:55:10 +0800
Subject: [PATCH 20/20] [OpenBLAS]: benchmark error of potrf

[description]: When the matrix size goes higher than 5800 during the cpotrf
test, error info such as "Potrf info = 5679" is returned on ARM64 and x86
machines. Uplo = L & F.
[solution]: Changed the function that builds the matrix so that the complex
Hermitian matrix stays positive definite during the computation.
[dts]:

---
 benchmark/potrf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/potrf.c b/benchmark/potrf.c
index 580e46072..cb4c23bab 100644
--- a/benchmark/potrf.c
+++ b/benchmark/potrf.c
@@ -193,14 +193,14 @@ int main(int argc, char *argv[]){
 	  a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
 
 	  for(i = j + 1; i < m; i++) {
-	    a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+	    a[((long)i + (long)j * (long)m) * 2 + 0] = 0;
 	    a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
 	  }
 	}
       } else {
 	for (j = 0; j < m; j++) {
 	  for(i = 0; i < j; i++) {
-	    a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
+	    a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
 	    a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
 	  }