Merge pull request #1108 from ashwinyes/develop_20170203_thunderx2t99

Optimized Implementations for ThunderX2T99
9 years ago · ffc1d6c468
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -751,6 +751,10 @@ void blas_set_parameter(void)
  cgemm_q = 224;
  cgemm_r = 4096;

  zgemm_p = 128;
  zgemm_q = 112;
  zgemm_r = 4096;

  dgemm_prefetch_size_a = 3584;
  dgemm_prefetch_size_b = 512;
  dgemm_prefetch_size_c = 128;
--- a/interface/swap.c
+++ b/interface/swap.c
@@ -42,9 +42,13 @@
 #include "functable.h"
 #endif

 #if defined(THUNDERX2T99) || defined(VULCAN)
 // Multithreaded swap gives performance benefits in ThunderX2T99
 #else
 // Disable multi-threading as it does not show any performance
 // benefits. Keep the multi-threading code for the record.
 #undef SMP
 #endif

 #ifndef CBLAS

@@ -81,7 +85,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
  if (incy < 0) y -= (n - 1) * incy;

 #ifdef SMP

  //disable multi-thread when incx==0 or incy==0
  //In that case, the threads would be dependent.
  if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
--- a/kernel/arm64/KERNEL.THUNDERX2T99
+++ b/kernel/arm64/KERNEL.THUNDERX2T99
@@ -10,13 +10,29 @@ DCOPYKERNEL    = copy_thunderx2t99.c
 CCOPYKERNEL    = copy_thunderx2t99.c
 ZCOPYKERNEL    = copy_thunderx2t99.c

 SNRM2KERNEL    = snrm2_thunderx2t99.c
 CNRM2KERNEL    = cnrm2_thunderx2t99.S
 SSWAPKERNEL    = swap_thunderx2t99.S
 DSWAPKERNEL    = swap_thunderx2t99.S
 CSWAPKERNEL    = swap_thunderx2t99.S
 ZSWAPKERNEL    = swap_thunderx2t99.S

 ISAMAXKERNEL   = iamax_thunderx2t99.c
 IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c

 SNRM2KERNEL    = scnrm2_thunderx2t99.c
 CNRM2KERNEL    = scnrm2_thunderx2t99.c
 #DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
 #ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
 DNRM2KERNEL    = dznrm2_thunderx2t99.c
 ZNRM2KERNEL    = dznrm2_thunderx2t99.c

 DAXPYKERNEL    = daxpy_thunderx2t99.S

 DDOTKERNEL     = ddot_thunderx2t99.c

 DDOTKERNEL     = dot_thunderx2t99.c
 SDOTKERNEL     = dot_thunderx2t99.c
 CDOTKERNEL     = zdot_thunderx2t99.c
 ZDOTKERNEL     = zdot_thunderx2t99.c

 ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
 DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
@@ -29,3 +45,7 @@ endif
 ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
 CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
 endif

 ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
 ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
 endif
--- a/kernel/arm64/cnrm2_thunderx2t99.S
+++ b/kernel/arm64/cnrm2_thunderx2t99.S
@@ -1,228 +0,0 @@
 /*******************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 #define TMPF	d16
 #define SSQ	s0
 #define SSQD	d0

 /******************************************************************************/

 .macro INIT
 	fmov	SSQD, xzr
 	fmov	d1, xzr
 	fmov	d2, xzr
 	fmov	d3, xzr
 	fmov	d4, xzr
 	fmov	d5, xzr
 	fmov	d6, xzr
 	fmov	d7, xzr
 .endm

 .macro KERNEL_F1
 	ldr	TMPF, [X]
 	add	X, X, #8
 	fcvtl	v16.2d, v16.2s
 	fmla	v0.2d, v16.2d, v16.2d
 .endm

 .macro KERNEL_F16
 	ldur	q16, [X]
 	ldur	q18, [X, #16]
 	ldur	q20, [X, #32]
 	ldur	q22, [X, #48]
 	ldur	q24, [X, #64]
 	ldur	q26, [X, #80]
 	ldur	q28, [X, #96]
 	ldur	q30, [X, #112]

 	add	X, X, #128

 	fcvtl2	v17.2d, v16.4s
 	fcvtl	v16.2d, v16.2s
 	fcvtl2	v19.2d, v18.4s
 	fcvtl	v18.2d, v18.2s
 	fcvtl2	v21.2d, v20.4s
 	fcvtl	v20.2d, v20.2s
 	fcvtl2	v23.2d, v22.4s
 	fcvtl	v22.2d, v22.2s
 	fcvtl2	v25.2d, v24.4s
 	fcvtl	v24.2d, v24.2s
 	fcvtl2	v27.2d, v26.4s
 	fcvtl	v26.2d, v26.2s
 	fcvtl2	v29.2d, v28.4s
 	fcvtl	v28.2d, v28.2s
 	fcvtl2	v31.2d, v30.4s
 	fcvtl	v30.2d, v30.2s

 	fmla	v0.2d, v16.2d, v16.2d
 	fmla	v1.2d, v17.2d, v17.2d
 	fmla	v2.2d, v18.2d, v18.2d
 	fmla	v3.2d, v19.2d, v19.2d
 	fmla	v4.2d, v20.2d, v20.2d
 	fmla	v5.2d, v21.2d, v21.2d
 	fmla	v6.2d, v22.2d, v22.2d
 	fmla	v7.2d, v23.2d, v23.2d

 	fmla	v0.2d, v24.2d, v24.2d
 	fmla	v1.2d, v25.2d, v25.2d
 	fmla	v2.2d, v26.2d, v26.2d
 	fmla	v3.2d, v27.2d, v27.2d
 	fmla	v4.2d, v28.2d, v28.2d
 	fmla	v5.2d, v29.2d, v29.2d
 	fmla	v6.2d, v30.2d, v30.2d
 	fmla	v7.2d, v31.2d, v31.2d

 	prfm	PLDL1KEEP, [X, #1024]
 	prfm	PLDL1KEEP, [X, #1024+64]
 .endm

 .macro KERNEL_F16_FINALIZE
 	fadd	v0.2d, v0.2d, v1.2d
 	fadd	v2.2d, v2.2d, v3.2d
 	fadd	v4.2d, v4.2d, v5.2d
 	fadd	v6.2d, v6.2d, v7.2d

 	fadd	v0.2d, v0.2d, v2.2d
 	fadd	v4.2d, v4.2d, v6.2d

 	fadd	v0.2d, v0.2d, v4.2d
 .endm

 .macro KERNEL_FINALIZE
 	faddp	SSQD, v0.2d
 .endm

 .macro INIT_S
 	lsl	INC_X, INC_X, #3
 .endm

 .macro KERNEL_S1
 	ldr	TMPF, [X]
 	add	X, X, INC_X
 	fcvtl	v16.2d, v16.2s
 	fmla	v0.2d, v16.2d, v16.2d
 .endm

 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/

 	PROLOGUE

 	INIT

 	cmp	N, xzr
 	ble	nrm2_kernel_zero
 	cmp	INC_X, xzr
 	ble	nrm2_kernel_zero
 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN

 nrm2_kernel_F_BEGIN:

 	asr	I, N, #4
 	cmp	I, xzr
 	beq	nrm2_kernel_S_BEGIN

 	.align 5
 nrm2_kernel_F16:

 	KERNEL_F16

 	subs	I, I, #1
 	bne	nrm2_kernel_F16

 	KERNEL_F16_FINALIZE

 nrm2_kernel_F1:

 	ands	I, N, #15
 	ble	nrm2_kernel_L999

 nrm2_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
 	bne     nrm2_kernel_F10

 	b	nrm2_kernel_L999

 nrm2_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	nrm2_kernel_S1

 nrm2_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1

 	subs	I, I, #1
 	bne	nrm2_kernel_S4

 nrm2_kernel_S1:

 	ands	I, N, #3
 	ble	nrm2_kernel_L999

 nrm2_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
 	bne     nrm2_kernel_S10

 nrm2_kernel_L999:
 	KERNEL_FINALIZE
 	fsqrt	SSQD, SSQD
 	fcvt	SSQ, SSQD
 	ret

 nrm2_kernel_zero:
 	fmov	SSQ, wzr

 	ret

 	EPILOGUE
--- a/kernel/arm64/ddot_thunderx2t99.c
+++ b/kernel/arm64/ddot_thunderx2t99.c
@@ -1,269 +0,0 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/


 #include "common.h"

 #include <arm_neon.h>

 #define	N	"x0"	/* vector length */
 #define	X	"x1"	/* "X" vector address */
 #define	INC_X	"x2"	/* "X" stride */
 #define	Y	"x3"	/* "Y" vector address */
 #define	INC_Y	"x4"	/* "Y" stride */
 #define J	"x5"	/* loop variable */

 #define REG0	"xzr"
 #define DOTF	"d0"
 #define TMPX	"d16"
 #define LD1VX	"{v16.d}[0]"
 #define TMPY	"d24"
 #define LD1VY	"{v24.d}[0]"
 #define SZ	"8"

 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif


 static FLOAT ddot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	FLOAT  dot = 0.0 ;

 	if ( n < 0 )  return(dot);

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]			\n"
 	"	mov	"X", %[X_]			\n"
 	"	mov	"INC_X", %[INCX_]		\n"
 	"	mov	"Y", %[Y_]			\n"
 	"	mov	"INC_Y", %[INCY_]		\n"
 	"	fmov	"DOTF", "REG0"			\n"
 	"	fmov	d1, "REG0"			\n"
 	"	fmov	d2, "REG0"			\n"
 	"	fmov	d3, "REG0"			\n"
 	"	fmov	d4, "REG0"			\n"
 	"	fmov	d5, "REG0"			\n"
 	"	fmov	d6, "REG0"			\n"
 	"	fmov	d7, "REG0"			\n"

 	"	cmp	"N", xzr			\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	"	cmp	"INC_X", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"
 	"	cmp	"INC_Y", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"

 	".Ldot_kernel_F_BEGIN:				\n"
 	"	asr	"J", "N", #5			\n"
 	"	cmp	"J", xzr			\n"
 	"	beq	.Ldot_kernel_F1			\n"

 	"	.align 5				\n"
 	".Ldot_kernel_F32:				\n"
 	"	ldp	q16, q17, ["X"]			\n"
 	"	ldp	q24, q25, ["Y"]			\n"
 	"	ldp	q18, q19, ["X", #32]		\n"
 	"	ldp	q26, q27, ["Y", #32]		\n"
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"
 	"	ldp	q20, q21, ["X", #64]		\n"
 	"	ldp	q28, q29, ["Y", #64]		\n"
 	"	fmla	v2.2d, v18.2d, v26.2d		\n"
 	"	fmla	v3.2d, v19.2d, v27.2d		\n"
 	"	ldp	q22, q23, ["X", #96]		\n"
 	"	ldp	q30, q31, ["Y", #96]		\n"
 	"	add	"Y", "Y", #128			\n"
 	"	add	"X", "X", #128			\n"
 	"	fmla	v4.2d, v20.2d, v28.2d		\n"
 	"	fmla	v5.2d, v21.2d, v29.2d		\n"
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"
 	"	fmla	v6.2d, v22.2d, v30.2d		\n"
 	"	fmla	v7.2d, v23.2d, v31.2d		\n"

 	"	ldp	q16, q17, ["X"]			\n"
 	"	ldp	q24, q25, ["Y"]			\n"
 	"	ldp	q18, q19, ["X", #32]		\n"
 	"	ldp	q26, q27, ["Y", #32]		\n"
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"
 	"	ldp	q20, q21, ["X", #64]		\n"
 	"	ldp	q28, q29, ["Y", #64]		\n"
 	"	fmla	v2.2d, v18.2d, v26.2d		\n"
 	"	fmla	v3.2d, v19.2d, v27.2d		\n"
 	"	ldp	q22, q23, ["X", #96]		\n"
 	"	ldp	q30, q31, ["Y", #96]		\n"
 	"	add	"Y", "Y", #128			\n"
 	"	add	"X", "X", #128			\n"
 	"	fmla	v4.2d, v20.2d, v28.2d		\n"
 	"	fmla	v5.2d, v21.2d, v29.2d		\n"
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"
 	"	fmla	v6.2d, v22.2d, v30.2d		\n"
 	"	fmla	v7.2d, v23.2d, v31.2d		\n"

 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F32		\n"

 	"	fadd	v0.2d, v0.2d, v1.2d		\n"
 	"	fadd	v2.2d, v2.2d, v3.2d		\n"
 	"	fadd	v4.2d, v4.2d, v5.2d		\n"
 	"	fadd	v6.2d, v6.2d, v7.2d		\n"
 	"	fadd	v0.2d, v0.2d, v2.2d		\n"
 	"	fadd	v4.2d, v4.2d, v6.2d		\n"
 	"	fadd	v0.2d, v0.2d, v4.2d		\n"
 	"	faddp	"DOTF", v0.2d			\n"

 	".Ldot_kernel_F1:				\n"
 	"	ands	"J", "N", #31			\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_F10:				\n"
 	"	ldr	"TMPX", ["X"]			\n"
 	"	ldr	"TMPY", ["Y"]			\n"
 	"	add	"X", "X", #"SZ"			\n"
 	"	add	"Y", "Y", #"SZ"			\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"

 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F10		\n"

 	"	b	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", #3		\n"
 	"	lsl	"INC_Y", "INC_Y", #3		\n"
 	"	asr	"J", "N", #2			\n"
 	"	cmp	"J", xzr			\n"
 	"	ble	.Ldot_kernel_S1			\n"

 	".Ldot_kernel_S4:				\n"
 	"	ld1	"LD1VX", ["X"], "INC_X"		\n"
 	"	ld1	"LD1VY", ["Y"], "INC_Y"		\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"
 	"	ld1	"LD1VX", ["X"], "INC_X"		\n"
 	"	ld1	"LD1VY", ["Y"], "INC_Y"		\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"
 	"	ld1	"LD1VX", ["X"], "INC_X"		\n"
 	"	ld1	"LD1VY", ["Y"], "INC_Y"		\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"
 	"	ld1	"LD1VX", ["X"], "INC_X"		\n"
 	"	ld1	"LD1VY", ["Y"], "INC_Y"		\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S4			\n"

 	".Ldot_kernel_S1:				\n"
 	"	ands	"J", "N", #3			\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S10:				\n"
 	"	ld1	"LD1VX", ["X"], "INC_X"		\n"
 	"	ld1	"LD1VY", ["Y"], "INC_Y"		\n"
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"	\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S10		\n"

 	".Ldot_kernel_L999:				\n"
 	"	fmov	%[DOT_], "DOTF"			\n"

 	: [DOT_]  "=r" (dot)		//%0
 	: [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x),		//%3
 	  [Y_]    "r"  (y),		//%4
 	  [INCY_] "r"  (inc_y)		//%5
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	return(dot);
 }

 #if defined(SMP)
 static int ddot_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 	*result = ddot_compute(n, x, inc_x, y, inc_y);

 	return 0;
 }
 #endif

 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
 #endif
 	FLOAT dot = 0.0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (inc_x == 0 || inc_y == 0)
 		nthreads = 1;

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		dot = ddot_compute(n, x, inc_x, y, inc_y);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		FLOAT *ptr;

 		mode = BLAS_DOUBLE  | BLAS_REAL;

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, y, inc_y, result, 0,
 				   ( void *)ddot_thread_function, nthreads);

 		ptr = (FLOAT *)result;
 		for (i = 0; i < nthreads; i++) {
 			dot = dot + (*ptr);
 			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	dot = ddot_compute(n, x, inc_x, y, inc_y);
 #endif

 	return dot;
 }
--- a/kernel/arm64/dnrm2.S
+++ b/kernel/arm64/dnrm2.S
@@ -1,169 +0,0 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 #define TMPF	d6
 #define SSQ	d0
 #define TMPVF	{v6.d}[0]
 #define SZ	8

 /******************************************************************************/

 .macro KERNEL_F1
 	ldr	TMPF, [X], #SZ
 	fmul	TMPF, TMPF, TMPF
 	fadd	SSQ, SSQ, TMPF
 .endm

 .macro KERNEL_F8
 	ld1	{v1.2d, v2.2d}, [X], #32
 	fmla	v0.2d, v1.2d, v1.2d
 	fmla	v5.2d, v2.2d, v2.2d
 	ld1	{v3.2d, v4.2d}, [X], #32
 	fmla	v0.2d, v3.2d, v3.2d
 	fmla	v5.2d, v4.2d, v4.2d
 	PRFM	PLDL1KEEP, [X, #1024]
 .endm

 .macro nrm2_kernel_F8_FINALIZE
 	fadd	v0.2d, v0.2d, v5.2d
 	faddp	SSQ, v0.2d
 .endm

 .macro INIT_S
 	lsl	INC_X, INC_X, #3
 	ld1	TMPVF, [X], INC_X
 	fmul	SSQ, TMPF, TMPF
 .endm

 .macro KERNEL_S1
 	ld1	TMPVF, [X], INC_X
 	fmul	TMPF, TMPF, TMPF
 	fadd	SSQ, SSQ, TMPF
 .endm

 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/

 	PROLOGUE

 	fmov	SSQ, xzr
 	fmov	d5, SSQ

 	cmp	N, xzr
 	ble	nrm2_kernel_zero
 	cmp	INC_X, xzr
 	ble	nrm2_kernel_zero
 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN

 nrm2_kernel_F_BEGIN:

 	asr	I, N, #3
 	cmp	I, xzr
 	beq	nrm2_kernel_F1_INIT

 nrm2_kernel_F8:

 	KERNEL_F8

 	subs	I, I, #1
 	bne	nrm2_kernel_F8

 	nrm2_kernel_F8_FINALIZE

 nrm2_kernel_F1:

 	ands	I, N, #7
 	ble	nrm2_kernel_L999

 nrm2_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     nrm2_kernel_F10

 	b	nrm2_kernel_L999

 nrm2_kernel_F1_INIT:

 	b	nrm2_kernel_F1

 nrm2_kernel_S_BEGIN:

 	INIT_S

 	subs	N, N, #1
 	ble	nrm2_kernel_L999

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	nrm2_kernel_S1

 nrm2_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1

 	subs	I, I, #1
 	bne	nrm2_kernel_S4

 nrm2_kernel_S1:

 	ands	I, N, #3
 	ble	nrm2_kernel_L999

 nrm2_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
 	bne     nrm2_kernel_S10

 nrm2_kernel_L999:
 	fsqrt	SSQ, SSQ
 	ret

 nrm2_kernel_zero:
 	ret

 	EPILOGUE
--- a/kernel/arm64/dot_thunderx2t99.c
+++ b/kernel/arm64/dot_thunderx2t99.c
@@ -0,0 +1,423 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/


 #include "common.h"

 #include <arm_neon.h>

 #if !defined(DSDOT)
 #define RETURN_TYPE	FLOAT
 #else
 #define RETURN_TYPE	double
 #endif

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* "X" vector address */
 #define INC_X		"x2"	/* "X" stride */
 #define Y		"x3"	/* "Y" vector address */
 #define INC_Y		"x4"	/* "Y" stride */
 #define J		"x5"	/* loop variable */

 #if !defined(DOUBLE)
 #if !defined(DSDOT)
 #define REG0		"wzr"
 #define DOTF		"s0"
 #define TMPX		"s16"
 #define TMPY		"s24"
 #define INC_SHIFT	"2"
 #define N_DIV_SHIFT	"6"
 #define N_REM_MASK	"63"
 #else
 #define REG0		"xzr"
 #define DOTF		"d0"
 #define TMPX		"s16"
 #define TMPX1		"d2"
 #define TMPY		"s24"
 #define TMPY1		"d3"
 #define INC_SHIFT	"2"
 #define N_DIV_SHIFT	"4"
 #define N_REM_MASK	"15"
 #endif
 #else
 #define REG0		"xzr"
 #define DOTF		"d0"
 #define TMPX		"d16"
 #define TMPY		"d24"
 #define INC_SHIFT	"3"
 #define N_DIV_SHIFT	"5"
 #define N_REM_MASK	"31"
 #endif

 #if !defined(DOUBLE)

 #if !defined(DSDOT)
 #define KERNEL_F1						\
 	"	ldr	"TMPX", ["X"]			\n"	\
 	"	ldr	"TMPY", ["Y"]			\n"	\
 	"	add	"X", "X", "INC_X"		\n"	\
 	"	add	"Y", "Y", "INC_Y"		\n"	\
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"  \n"

 #define KERNEL_F						\
 	"	ldp	q16, q17, ["X"]			\n"	\
 	"	ldp	q24, q25, ["Y"]			\n"	\
 	"	ldp	q18, q19, ["X", #32]		\n"	\
 	"	ldp	q26, q27, ["Y", #32]		\n"	\
 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\
 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\
 	"	ldp	q20, q21, ["X", #64]		\n"	\
 	"	ldp	q28, q29, ["Y", #64]		\n"	\
 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\
 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\
 	"	ldp	q22, q23, ["X", #96]		\n"	\
 	"	ldp	q30, q31, ["Y", #96]		\n"	\
 	"	add	"Y", "Y", #128			\n"	\
 	"	add	"X", "X", #128			\n"	\
 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\
 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\
 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\
 	"	fmla	v7.4s, v23.4s, v31.4s		\n"	\
 	"	ldp	q16, q17, ["X"]			\n"	\
 	"	ldp	q24, q25, ["Y"]			\n"	\
 	"	ldp	q18, q19, ["X", #32]		\n"	\
 	"	ldp	q26, q27, ["Y", #32]		\n"	\
 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\
 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\
 	"	ldp	q20, q21, ["X", #64]		\n"	\
 	"	ldp	q28, q29, ["Y", #64]		\n"	\
 	"	fmla	v2.4s, v18.4s, v26.4s		\n"	\
 	"	fmla	v3.4s, v19.4s, v27.4s		\n"	\
 	"	ldp	q22, q23, ["X", #96]		\n"	\
 	"	ldp	q30, q31, ["Y", #96]		\n"	\
 	"	add	"Y", "Y", #128			\n"	\
 	"	add	"X", "X", #128			\n"	\
 	"	fmla	v4.4s, v20.4s, v28.4s		\n"	\
 	"	fmla	v5.4s, v21.4s, v29.4s		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\
 	"	fmla	v6.4s, v22.4s, v30.4s		\n"	\
 	"	fmla	v7.4s, v23.4s, v31.4s		\n"

 #define KERNEL_F_FINALIZE					\
 	"	fadd	v0.4s, v0.4s, v1.4s		\n"	\
 	"	fadd	v2.4s, v2.4s, v3.4s		\n"	\
 	"	fadd	v4.4s, v4.4s, v5.4s		\n"	\
 	"	fadd	v6.4s, v6.4s, v7.4s		\n"	\
 	"	fadd	v0.4s, v0.4s, v2.4s		\n"	\
 	"	fadd	v4.4s, v4.4s, v6.4s		\n"	\
 	"	fadd	v0.4s, v0.4s, v4.4s		\n"	\
 	"	faddp	v0.4s, v0.4s, v0.4s		\n"	\
 	"	faddp	v0.4s, v0.4s, v0.4s		\n"

 #else /* !defined(DSDOT) */
 #define KERNEL_F1						\
 	"	ldr	"TMPX", ["X"]			\n"	\
 	"	ldr	"TMPY", ["Y"]			\n"	\
 	"	add	"X", "X", "INC_X"		\n"	\
 	"	add	"Y", "Y", "INC_Y"		\n"	\
 	"	fcvt	"TMPX1", "TMPX"			\n"	\
 	"	fcvt	"TMPY1", "TMPY"			\n"	\
 	"	fmul	"TMPX1", "TMPX1", "TMPY1"	\n"	\
 	"	fadd	"DOTF", "DOTF", "TMPX1"		\n"


 #define KERNEL_F						\
 	"	ldp	q18, q19, ["X"]			\n"	\
 	"	ldp	q26, q27, ["Y"]			\n"	\
 	"	fcvtl	v16.2d, v18.2s			\n"	\
 	"	fcvtl2	v17.2d, v18.4s			\n"	\
 	"	fcvtl	v18.2d, v19.2s			\n"	\
 	"	fcvtl2	v19.2d, v19.4s			\n"	\
 	"	fcvtl	v24.2d, v26.2s			\n"	\
 	"	fcvtl2	v25.2d, v26.4s			\n"	\
 	"	fcvtl	v26.2d, v27.2s			\n"	\
 	"	fcvtl2	v27.2d, v27.4s			\n"	\
 	"	ldp	q22, q23, ["X", #32]		\n"	\
 	"	ldp	q30, q31, ["Y", #32]		\n"	\
 	"	fcvtl	v20.2d, v22.2s			\n"	\
 	"	fcvtl2	v21.2d, v22.4s			\n"	\
 	"	fcvtl	v22.2d, v23.2s			\n"	\
 	"	fcvtl2	v23.2d, v23.4s			\n"	\
 	"	fcvtl	v28.2d, v30.2s			\n"	\
 	"	fcvtl2	v29.2d, v30.4s			\n"	\
 	"	fcvtl	v30.2d, v31.2s			\n"	\
 	"	fcvtl2	v31.2d, v31.4s			\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\
 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\
 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\
 	"	add	"Y", "Y", #64			\n"	\
 	"	add	"X", "X", #64			\n"	\
 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\
 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\
 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\
 	"	fmla	v7.2d, v23.2d, v31.2d		\n"

 #define KERNEL_F_FINALIZE					\
 	"	fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"	fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"	fadd	v4.2d, v4.2d, v5.2d		\n"	\
 	"	fadd	v6.2d, v6.2d, v7.2d		\n"	\
 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\
 	"	faddp	"DOTF", v0.2d			\n"
 #endif /* !defined(DSDOT) */

 #else /* !defined(DOUBLE) */ 
 #define KERNEL_F1						\
 	"	ldr	"TMPX", ["X"]			\n"	\
 	"	ldr	"TMPY", ["Y"]			\n"	\
 	"	add	"X", "X", "INC_X"		\n"	\
 	"	add	"Y", "Y", "INC_Y"		\n"	\
 	"	fmadd	"DOTF", "TMPX", "TMPY", "DOTF"  \n"

 #define KERNEL_F						\
 	"	ldp	q16, q17, ["X"]			\n"	\
 	"	ldp	q24, q25, ["Y"]			\n"	\
 	"	ldp	q18, q19, ["X", #32]		\n"	\
 	"	ldp	q26, q27, ["Y", #32]		\n"	\
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\
 	"	ldp	q20, q21, ["X", #64]		\n"	\
 	"	ldp	q28, q29, ["Y", #64]		\n"	\
 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\
 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\
 	"	ldp	q22, q23, ["X", #96]		\n"	\
 	"	ldp	q30, q31, ["Y", #96]		\n"	\
 	"	add	"Y", "Y", #128			\n"	\
 	"	add	"X", "X", #128			\n"	\
 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\
 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\
 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\
 	"	fmla	v7.2d, v23.2d, v31.2d		\n"	\
 	"	ldp	q16, q17, ["X"]			\n"	\
 	"	ldp	q24, q25, ["Y"]			\n"	\
 	"	ldp	q18, q19, ["X", #32]		\n"	\
 	"	ldp	q26, q27, ["Y", #32]		\n"	\
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\
 	"	ldp	q20, q21, ["X", #64]		\n"	\
 	"	ldp	q28, q29, ["Y", #64]		\n"	\
 	"	fmla	v2.2d, v18.2d, v26.2d		\n"	\
 	"	fmla	v3.2d, v19.2d, v27.2d		\n"	\
 	"	ldp	q22, q23, ["X", #96]		\n"	\
 	"	ldp	q30, q31, ["Y", #96]		\n"	\
 	"	add	"Y", "Y", #128			\n"	\
 	"	add	"X", "X", #128			\n"	\
 	"	fmla	v4.2d, v20.2d, v28.2d		\n"	\
 	"	fmla	v5.2d, v21.2d, v29.2d		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #896+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #896+64]	\n"	\
 	"	fmla	v6.2d, v22.2d, v30.2d		\n"	\
 	"	fmla	v7.2d, v23.2d, v31.2d		\n"

 #define KERNEL_F_FINALIZE					\
 	"	fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"	fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"	fadd	v4.2d, v4.2d, v5.2d		\n"	\
 	"	fadd	v6.2d, v6.2d, v7.2d		\n"	\
 	"	fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"	fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\
 	"	faddp	"DOTF", v0.2d			\n"
 #endif /* !defined(DOUBLE) */

 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif

 static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	RETURN_TYPE  dot = 0.0 ;

 	if ( n < 0 ) return dot;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]			\n"
 	"	mov	"X", %[X_]			\n"
 	"	mov	"INC_X", %[INCX_]		\n"
 	"	mov	"Y", %[Y_]			\n"
 	"	mov	"INC_Y", %[INCY_]		\n"
 	"	fmov	"DOTF", "REG0"			\n"
 	"	fmov	d1, xzr				\n"
 	"	fmov	d2, xzr				\n"
 	"	fmov	d3, xzr				\n"
 	"	fmov	d4, xzr				\n"
 	"	fmov	d5, xzr				\n"
 	"	fmov	d6, xzr				\n"
 	"	fmov	d7, xzr				\n"
 	"	cmp	"N", xzr			\n"
 	"	ble	.Ldot_kernel_L999		\n"
 	"	cmp	"INC_X", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"
 	"	cmp	"INC_Y", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"

 	".Ldot_kernel_F_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n"
 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"	\n"
 	"	cmp	"J", xzr			\n"
 	"	beq	.Ldot_kernel_F1			\n"

 	"	.align 5				\n"
 	".Ldot_kernel_F:				\n"
 	"	"KERNEL_F"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"			\n"

 	".Ldot_kernel_F1:				\n"
 	"	ands	"J", "N", #"N_REM_MASK"		\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_F10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F10		\n"
 	"	b	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n"
 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n"
 	"	asr	"J", "N", #2			\n"
 	"	cmp	"J", xzr			\n"
 	"	ble	.Ldot_kernel_S1			\n"

 	".Ldot_kernel_S4:				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S4			\n"

 	".Ldot_kernel_S1:				\n"
 	"	ands	"J", "N", #3			\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S10		\n"

 	".Ldot_kernel_L999:				\n"
 	"	str	"DOTF", [%[DOT_]]		\n"

 	:
 	: [DOT_]  "r"  (&dot),		//%0
 	  [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x),		//%3
 	  [Y_]    "r"  (y),		//%4
 	  [INCY_] "r"  (inc_y)		//%5
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	return dot;
 }

 #if defined(SMP)
 static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 	*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);

 	return 0;
 }
 #endif

 RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
 #endif
 	RETURN_TYPE dot = 0.0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (inc_x == 0 || inc_y == 0)
 		nthreads = 1;

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		dot = dot_compute(n, x, inc_x, y, inc_y);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		RETURN_TYPE *ptr;

 #if !defined(DOUBLE)
 		mode = BLAS_SINGLE  | BLAS_REAL;
 #else
 		mode = BLAS_DOUBLE  | BLAS_REAL;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, y, inc_y, result, 0,
 				   ( void *)dot_thread_function, nthreads);

 		ptr = (RETURN_TYPE *)result;
 		for (i = 0; i < nthreads; i++) {
 			dot = dot + (*ptr);
 			ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	dot = dot_compute(n, x, inc_x, y, inc_y);
 #endif

 	return dot;
 }
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -0,0 +1,384 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/


 #include "common.h"

 #include <arm_neon.h>

 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* X vector address */
 #define INC_X		"x2"	/* X stride */
 #define J		"x3"	/* loop variable */
 #define K		"x4"	/* loop variable */

 #if !defined(COMPLEX)
 #define INC_SHIFT	"3"
 #define SZ		"8"
 #else
 #define INC_SHIFT	"4"
 #define SZ		"16"
 #endif

 #define SSQ		"d0"
 #define SCALE		"d1"
 #define REGZERO		"d5"
 #define REGONE		"d6"
 #define CUR_MAX		"d7"
 #define CUR_MAXINV	"d8"
 #define CUR_MAXINV_V	"v8.2d"
 #define CUR_MAX_V	"v8.2d"

 static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 		         double *ssq, double *scale)
 {
 	*ssq = 0.0;
 	*scale = 0.0;

 	if (n <= 0)  return;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]				\n"
 	"	mov	"X", %[X_]				\n"
 	"	mov	"INC_X", %[INCX_]			\n"
 	"	fmov	"SCALE", xzr				\n"
 	"	fmov	"SSQ", #1.0				\n"
 	"	cmp	"N", xzr				\n"
 	"	ble	.Lnrm2_kernel_L999			\n"
 	"	cmp	"INC_X", xzr				\n"
 	"	ble	.Lnrm2_kernel_L999			\n"

 	".Lnrm2_kernel_F_BEGIN:					\n"
 	"	fmov	"REGZERO", xzr				\n"
 	"	fmov	"REGONE", #1.0				\n"
 	"	lsl	"INC_X", "INC_X", #"INC_SHIFT"		\n"
 	"	mov	"J", "N"				\n"
 	"	cmp	"J", xzr				\n"
 	"	beq	.Lnrm2_kernel_L999			\n"

 	".Lnrm2_kernel_F_ZERO_SKIP:				\n"
 	"	ldr	d4, ["X"]				\n"
 	"	fcmp	d4, "REGZERO"				\n"
 	"	bne	.Lnrm2_kernel_F_INIT			\n"
 #if defined(COMPLEX)
 	"	ldr	d4, ["X", #8]				\n"
 	"	fcmp	d4, "REGZERO"				\n"
 	"	bne	.Lnrm2_kernel_F_INIT_I			\n"
 #endif
 	"	add	"X", "X", "INC_X"			\n"
 	"	subs	"J", "J", #1				\n"
 	"	beq	.Lnrm2_kernel_L999			\n"
 	"	b	.Lnrm2_kernel_F_ZERO_SKIP		\n"

 	".Lnrm2_kernel_F_INIT:					\n"
 	"	ldr	d4, ["X"]				\n"
 	"	fabs	d4, d4					\n"
 	"	fmax	"CUR_MAX", "SCALE", d4			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	fdiv	d4, d4, "CUR_MAX"			\n"
 	"	fmul	d4, d4, d4				\n"
 	"	fadd	"SSQ", "SSQ", d4			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #if defined(COMPLEX)
 	".Lnrm2_kernel_F_INIT_I:				\n"
 	"	ldr	d3, ["X", #8]				\n"
 	"	fabs	d3, d3					\n"
 	"	fmax	"CUR_MAX", "SCALE", d3			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	fdiv	d3, d3, "CUR_MAX"			\n"
 	"	fmul	d3, d3, d3				\n"
 	"	fadd	"SSQ", "SSQ", d3			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #endif
 	"	add	"X", "X", "INC_X"			\n"
 	"	subs	"J", "J", #1				\n"
 	"	beq	.Lnrm2_kernel_L999			\n"

 	".Lnrm2_kernel_F_START:					\n"
 	"	cmp	"INC_X", #"SZ"				\n"
 	"	bne	.Lnrm2_kernel_F1			\n"
 	"	asr	"K", "J", #4				\n"
 	"	cmp	"K", xzr				\n"
 	"	beq	.Lnrm2_kernel_F1			\n"

 	".Lnrm2_kernel_F:					\n"
 	"	ldp	q16, q17,  ["X"]			\n"
 	"	ldp	q18, q19,  ["X", #32]			\n"
 	"	ldp	q20, q21,  ["X", #64]			\n"
 	"	ldp	q22, q23,  ["X", #96]			\n"
 	"	add	"X", "X", #128				\n"
 	"	fabs	v16.2d, v16.2d				\n"
 	"	fabs	v17.2d, v17.2d				\n"
 	"	fabs	v18.2d, v18.2d				\n"
 	"	fabs	v19.2d, v19.2d				\n"
 	"	fabs	v20.2d, v20.2d				\n"
 	"	fabs	v21.2d, v21.2d				\n"
 	"	fabs	v22.2d, v22.2d				\n"
 	"	fabs	v23.2d, v23.2d				\n"
 	"	fmaxp	v24.2d, v16.2d, v17.2d			\n"
 	"	fmaxp	v25.2d, v18.2d, v19.2d			\n"
 	"	fmaxp	v26.2d, v20.2d, v21.2d			\n"
 	"	fmaxp	v27.2d, v22.2d, v23.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v25.2d			\n"
 	"	fmaxp	v26.2d, v26.2d, v27.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v26.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v24.2d			\n"
 	"	fmax	"CUR_MAX", "SCALE", d24			\n"
 	"	fdiv	"CUR_MAXINV", "REGONE", "CUR_MAX"	\n"
 	"	//dup	"CUR_MAX_V", v7.d[0]			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	dup	"CUR_MAXINV_V", v8.d[0]			\n"
 	"	fmul	v16.2d, v16.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v17.2d, v17.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v18.2d, v18.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v19.2d, v19.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v20.2d, v20.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v21.2d, v21.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v22.2d, v22.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v23.2d, v23.2d, "CUR_MAXINV_V"		\n"
 	"	//fdiv	v16.2d, v16.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v17.2d, v17.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v18.2d, v18.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v19.2d, v19.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v20.2d, v20.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v21.2d, v21.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v22.2d, v22.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v23.2d, v23.2d, "CUR_MAX_V"		\n"
 	"	fmul	v24.2d, v16.2d, v16.2d			\n"
 	"	fmul	v25.2d, v17.2d, v17.2d			\n"
 	"	fmul	v26.2d, v18.2d, v18.2d			\n"
 	"	fmul	v27.2d, v19.2d, v19.2d			\n"
 	"	fmla	v24.2d, v20.2d, v20.2d			\n"
 	"	fmla	v25.2d, v21.2d, v21.2d			\n"
 	"	fmla	v26.2d, v22.2d, v22.2d			\n"
 	"	fmla	v27.2d, v23.2d, v23.2d			\n"
 	"	fadd	v24.2d, v24.2d, v25.2d			\n"
 	"	fadd	v26.2d, v26.2d, v27.2d			\n"
 	"	fadd	v24.2d, v24.2d, v26.2d			\n"
 	"	faddp	d24, v24.2d				\n"
 	"	fadd	"SSQ", "SSQ", d24			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #if defined(COMPLEX)
 	"	ldp	q16, q17,  ["X"]			\n"
 	"	ldp	q18, q19,  ["X", #32]			\n"
 	"	ldp	q20, q21,  ["X", #64]			\n"
 	"	ldp	q22, q23,  ["X", #96]			\n"
 	"	add	"X", "X", #128				\n"
 	"	fabs	v16.2d, v16.2d				\n"
 	"	fabs	v17.2d, v17.2d				\n"
 	"	fabs	v18.2d, v18.2d				\n"
 	"	fabs	v19.2d, v19.2d				\n"
 	"	fabs	v20.2d, v20.2d				\n"
 	"	fabs	v21.2d, v21.2d				\n"
 	"	fabs	v22.2d, v22.2d				\n"
 	"	fabs	v23.2d, v23.2d				\n"
 	"	fmaxp	v24.2d, v16.2d, v17.2d			\n"
 	"	fmaxp	v25.2d, v18.2d, v19.2d			\n"
 	"	fmaxp	v26.2d, v20.2d, v21.2d			\n"
 	"	fmaxp	v27.2d, v22.2d, v23.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v25.2d			\n"
 	"	fmaxp	v26.2d, v26.2d, v27.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v26.2d			\n"
 	"	fmaxp	v24.2d, v24.2d, v24.2d			\n"
 	"	fmax	"CUR_MAX", "SCALE", d24			\n"
 	"	fdiv	"CUR_MAXINV", "REGONE", "CUR_MAX"	\n"
 	"	//dup	"CUR_MAX_V", v7.d[0]			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	dup	"CUR_MAXINV_V", v8.d[0]			\n"
 	"	fmul	v16.2d, v16.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v17.2d, v17.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v18.2d, v18.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v19.2d, v19.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v20.2d, v20.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v21.2d, v21.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v22.2d, v22.2d, "CUR_MAXINV_V"		\n"
 	"	fmul	v23.2d, v23.2d, "CUR_MAXINV_V"		\n"
 	"	//fdiv	v16.2d, v16.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v17.2d, v17.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v18.2d, v18.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v19.2d, v19.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v20.2d, v20.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v21.2d, v21.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v22.2d, v22.2d, "CUR_MAX_V"		\n"
 	"	//fdiv	v23.2d, v23.2d, "CUR_MAX_V"		\n"
 	"	fmul	v24.2d, v16.2d, v16.2d			\n"
 	"	fmul	v25.2d, v17.2d, v17.2d			\n"
 	"	fmul	v26.2d, v18.2d, v18.2d			\n"
 	"	fmul	v27.2d, v19.2d, v19.2d			\n"
 	"	fmla	v24.2d, v20.2d, v20.2d			\n"
 	"	fmla	v25.2d, v21.2d, v21.2d			\n"
 	"	fmla	v26.2d, v22.2d, v22.2d			\n"
 	"	fmla	v27.2d, v23.2d, v23.2d			\n"
 	"	fadd	v24.2d, v24.2d, v25.2d			\n"
 	"	fadd	v26.2d, v26.2d, v27.2d			\n"
 	"	fadd	v24.2d, v24.2d, v26.2d			\n"
 	"	faddp	d24, v24.2d				\n"
 	"	fadd	"SSQ", "SSQ", d24			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #endif
 	"	subs	"K", "K", #1				\n"
 	"	bne	.Lnrm2_kernel_F				\n"

 	".Lnrm2_kernel_F_DONE:					\n"
 	"	ands	"J", "J", #15				\n"
 	"	beq	.Lnrm2_kernel_L999			\n"

 	".Lnrm2_kernel_F1:					\n"
 	"	ldr	d4, ["X"]				\n"
 	"	fabs	d4, d4					\n"
 	"	fmax	"CUR_MAX", "SCALE", d4			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	fdiv	d4, d4, "CUR_MAX"			\n"
 	"	fmul	d4, d4, d4				\n"
 	"	fadd	"SSQ", "SSQ", d4			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #if defined(COMPLEX)
 	"	ldr	d3, ["X", #8]				\n"
 	"	fabs	d3, d3					\n"
 	"	fmax	"CUR_MAX", "SCALE", d3			\n"
 	"	fdiv	"SCALE", "SCALE", "CUR_MAX"		\n"
 	"	fmul	"SCALE", "SCALE", "SCALE"		\n"
 	"	fmul	"SSQ", "SSQ", "SCALE"			\n"
 	"	fdiv	d3, d3, "CUR_MAX"			\n"
 	"	fmul	d3, d3, d3				\n"
 	"	fadd	"SSQ", "SSQ", d3			\n"
 	"	fmov	"SCALE", "CUR_MAX"			\n"
 #endif
 	"	add	"X", "X", "INC_X"			\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Lnrm2_kernel_F1			\n"

 	".Lnrm2_kernel_L999:					\n"
 	"	str	"SSQ", [%[SSQ_]]			\n"
 	"	str	"SCALE", [%[SCALE_]]			\n"

 	:
 	: [SSQ_]    "r"  (ssq),			//%0
 	  [SCALE_]  "r"  (scale),		//%1
 	  [N_]      "r"  (n),			//%2
 	  [X_]      "r"  (x),			//%3
 	  [INCX_]   "r"  (inc_x)		//%4
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
 	);

 }

 #if defined(SMP)
 static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
 	BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
 {
 	nrm2_compute(n, x, inc_x, result, result + 1);

 	return 0;
 }
 #endif

 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha[2];
 #endif
 	FLOAT ssq, scale;

 	if (n <= 0 || inc_x <= 0) return 0.0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		nrm2_compute(n, x, inc_x, &ssq, &scale);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		double *ptr;

 #if !defined(COMPLEX)
 		mode = BLAS_DOUBLE  | BLAS_REAL;
 #else
 		mode = BLAS_DOUBLE  | BLAS_COMPLEX;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, NULL, 0, result, 0,
 				   ( void *)nrm2_thread_function, nthreads);

 		scale = 0.0;
 		ssq = 1.0;
 		ptr = (double *)result;
 		for (i = 0; i < nthreads; i++) {
 			FLOAT cur_scale, cur_ssq;

 			cur_ssq = *ptr;
 			cur_scale = *(ptr + 1);

 			if (cur_scale != 0) {
 				if (cur_scale > scale) {
 					scale = (scale / cur_scale);
 					ssq = ssq * scale * scale;
 					ssq += cur_ssq;
 					scale = cur_scale;
 				} else {
 					cur_scale = (cur_scale / scale);
 					cur_ssq = cur_ssq * cur_scale * cur_scale;
 					ssq += cur_ssq;
 				}
 			}

 			ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	nrm2_compute(n, x, inc_x, &ssq, &scale);
 #endif
 	ssq = sqrt(ssq) * scale;

 	return ssq;
 }
--- a/kernel/arm64/dznrm2_thunderx2t99_fast.c
+++ b/kernel/arm64/dznrm2_thunderx2t99_fast.c
@@ -36,54 +36,47 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif

 #define	N	"x0"	/* vector length */
 #define	X	"x1"	/* X vector address */
 #define	INC_X	"x2"	/* X stride */
 #define I	"x5"	/* loop variable */
 #define N		"x0"	/* vector length */
 #define X		"x1"	/* X vector address */
 #define INC_X		"x2"	/* X stride */
 #define J		"x5"	/* loop variable */

 #define TMPF		"d16"
 #define SSQ		"d0"

 #if !defined(COMPLEX)
 #define N_DIV_SHIFT	"5"
 #define N_REM_MASK	"31"
 #define INC_SHIFT	"3"
 #else
 #define N_DIV_SHIFT	"4"
 #define N_REM_MASK	"15"
 #define INC_SHIFT	"4"
 #endif

 #define TMPF	"s16"
 #define TMPFD	"d17"
 #define SSQD	"d0"

 #define KERNEL_F1					\
 	"ldr	"TMPF", ["X"], #4		\n"	\
 	"fcvt	"TMPFD", "TMPF"			\n"	\
 	"fmadd	"SSQD", "TMPFD", "TMPFD", "SSQD"\n"

 #define KERNEL_F32					\
 	"ldur	q16, ["X"]			\n"	\
 	"ldur	q18, ["X", #16]			\n"	\
 	"ldur	q20, ["X", #32]			\n"	\
 	"ldur	q22, ["X", #48]			\n"	\
 	"ldur	q24, ["X", #64]			\n"	\
 	"ldur	q26, ["X", #80]			\n"	\
 	"ldur	q28, ["X", #96]			\n"	\
 	"ldur	q30, ["X", #112]		\n"	\
 	"add	"X", "X", #128			\n"	\
 	"fcvtl2	v17.2d, v16.4s			\n"	\
 	"fcvtl	v16.2d, v16.2s			\n"	\
 	"fcvtl2	v19.2d, v18.4s			\n"	\
 	"fcvtl	v18.2d, v18.2s			\n"	\
 	"fcvtl2	v21.2d, v20.4s			\n"	\
 	"fcvtl	v20.2d, v20.2s			\n"	\
 	"fcvtl2	v23.2d, v22.4s			\n"	\
 	"fcvtl	v22.2d, v22.2s			\n"	\
 	"fcvtl2	v25.2d, v24.4s			\n"	\
 	"fcvtl	v24.2d, v24.2s			\n"	\
 	"fcvtl2	v27.2d, v26.4s			\n"	\
 	"fcvtl	v26.2d, v26.2s			\n"	\
 	"fcvtl2	v29.2d, v28.4s			\n"	\
 	"fcvtl	v28.2d, v28.2s			\n"	\
 	"fcvtl2	v31.2d, v30.4s			\n"	\
 	"fcvtl	v30.2d, v30.2s			\n"	\
 #define KERNEL_F					\
 	"ldp	q16, q17, ["X"]			\n"	\
 	"ldp	q18, q19, ["X", #32]		\n"	\
 	"ldp	q20, q21, ["X", #64]		\n"	\
 	"ldp	q22, q23, ["X", #96]		\n"	\
 	"ldp	q24, q25, ["X", #128]		\n"	\
 	"ldp	q26, q27, ["X", #160]		\n"	\
 	"ldp	q28, q29, ["X", #192]		\n"	\
 	"ldp	q30, q31, ["X", #224]		\n"	\
 	"add	"X", "X", #256			\n"	\
 	"fmla	v0.2d, v16.2d, v16.2d		\n"	\
 	"fmla	v1.2d, v17.2d, v17.2d		\n"	\
 	"fmla	v2.2d, v18.2d, v18.2d		\n"	\
 	"fmla	v3.2d, v19.2d, v19.2d		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024]		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+64]	\n"	\
 	"fmla	v4.2d, v20.2d, v20.2d		\n"	\
 	"fmla	v5.2d, v21.2d, v21.2d		\n"	\
 	"fmla	v6.2d, v22.2d, v22.2d		\n"	\
 	"fmla	v7.2d, v23.2d, v23.2d		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+128]	\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+192]	\n"	\
 	"fmla	v0.2d, v24.2d, v24.2d		\n"	\
 	"fmla	v1.2d, v25.2d, v25.2d		\n"	\
 	"fmla	v2.2d, v26.2d, v26.2d		\n"	\
@@ -91,11 +84,16 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
 	"fmla	v4.2d, v28.2d, v28.2d		\n"	\
 	"fmla	v5.2d, v29.2d, v29.2d		\n"	\
 	"fmla	v6.2d, v30.2d, v30.2d		\n"	\
 	"fmla	v7.2d, v31.2d, v31.2d		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024]		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+64]	\n"
 	"fmla	v7.2d, v31.2d, v31.2d		\n"


 #if !defined(COMPLEX)
 #define KERNEL_F1					\
 	"ldr	"TMPF", ["X"]			\n"	\
 	"add	"X", "X", "INC_X"		\n"	\
 	"fmadd	"SSQ", "TMPF", "TMPF", "SSQ"	\n"

 #define KERNEL_F32_FINALIZE				\
 #define KERNEL_F_FINALIZE				\
 	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
@@ -103,14 +101,28 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
 	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v4.2d		\n"	\
 	"faddp	"SSQD", v0.2d			\n"
 	"faddp	"SSQ", v0.2d			\n"

 #define KERNEL_S1					\
 	"ldr	"TMPF", ["X"]			\n"	\
 #define KERNEL_FINALIZE					\
 	""
 #else
 #define KERNEL_F1					\
 	"ldr	q16, ["X"]			\n"	\
 	"add	"X", "X", "INC_X"		\n"	\
 	"fcvt	"TMPFD", "TMPF"			\n"	\
 	"fmadd	"SSQD", "TMPFD", "TMPFD", "SSQD"\n"
 	"fmla	v0.2d, v16.2d, v16.2d		\n"

 #define KERNEL_F_FINALIZE				\
 	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
 	"fadd	v6.2d, v6.2d, v7.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v4.2d		\n"

 #define KERNEL_FINALIZE					\
 	"faddp	"SSQ", v0.2d			\n"
 #endif

 static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -122,7 +134,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	"	mov	"N", %[N_]			\n"
 	"	mov	"X", %[X_]			\n"
 	"	mov	"INC_X", %[INCX_]		\n"
 	"	fmov	"SSQD", xzr			\n"
 	"	fmov	"SSQ", xzr			\n"
 	"	fmov	d1, xzr				\n"
 	"	fmov	d2, xzr				\n"
 	"	fmov	d3, xzr				\n"
@@ -138,56 +150,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	"	bne	.Lnrm2_kernel_S_BEGIN		\n"

 	".Lnrm2_kernel_F_BEGIN:				\n"
 	"	asr	"I", "N", #6			\n"
 	"	cmp	"I", xzr			\n"
 	"	beq	.Lnrm2_kernel_S_BEGIN		\n"
 	"	lsl	"INC_X", "INC_X", #"INC_SHIFT"	\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"	\n"
 	"	cmp	"J", xzr			\n"
 	"	beq	.Lnrm2_kernel_F1		\n"

 	"	.align 5				\n"
 	".Lnrm2_kernel_F64:				\n"
 	"	"KERNEL_F32"				\n"
 	"	"KERNEL_F32"				\n"
 	"	subs	"I", "I", #1			\n"
 	"	bne	.Lnrm2_kernel_F64		\n"
 	"	"KERNEL_F32_FINALIZE"			\n"
 	".Lnrm2_kernel_F:				\n"
 	"	"KERNEL_F"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"			\n"

 	".Lnrm2_kernel_F1:				\n"
 	"	ands	"I", "N", #63			\n"
 	"	ands	"J", "N", #"N_REM_MASK"		\n"
 	"	ble	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_F10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"I", "I", #1			\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_F10		\n"
 	"	b	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_S_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", #2		\n"
 	"	asr	"I", "N", #2			\n"
 	"	cmp	"I", xzr			\n"
 	"	lsl	"INC_X", "INC_X", #"INC_SHIFT"	\n"
 	"	asr	"J", "N", #2			\n"
 	"	cmp	"J", xzr			\n"
 	"	ble	.Lnrm2_kernel_S1		\n"

 	".Lnrm2_kernel_S4:				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	subs	"I", "I", #1			\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_S4		\n"

 	".Lnrm2_kernel_S1:				\n"
 	"	ands	"I", "N", #3			\n"
 	"	ands	"J", "N", #3			\n"
 	"	ble	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_S10:				\n"
 	"	"KERNEL_S1"				\n"
 	"	subs	"I", "I", #1			\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_S10		\n"

 	".Lnrm2_kernel_L999:				\n"
 	"	fmov	%[RET_], "SSQD"			\n"
 	"	"KERNEL_FINALIZE"			\n"
 	"	str	"SSQ", [%[RET_]]		\n"

 	: [RET_]  "=r" (ret)		//%0
 	: [N_]    "r"  (n),		//%1
 	:
 	: [RET_]  "r"  (&ret),		//%0
 	  [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x)		//%3
 	: "cc",
@@ -214,13 +228,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
 	FLOAT dummy_alpha[2];
 #endif
 	FLOAT nrm2 = 0.0;
 	double nrm2_double = 0.0;

 	if (n <= 0 || inc_x <= 0) return 0.0;
 	if (n == 1) return fabs(x[0]);

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);
@@ -229,13 +241,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 		nthreads = 1;

 	if (nthreads == 1) {
 		nrm2_double = nrm2_compute(n, x, inc_x);
 		nrm2 = nrm2_compute(n, x, inc_x);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		double *ptr;

 		mode = BLAS_SINGLE  | BLAS_REAL;
 #if !defined(COMPLEX)
 		mode = BLAS_DOUBLE  | BLAS_REAL;
 #else
 		mode = BLAS_DOUBLE  | BLAS_COMPLEX;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, NULL, 0, result, 0,
@@ -243,14 +259,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)

 		ptr = (double *)result;
 		for (i = 0; i < nthreads; i++) {
 			nrm2_double = nrm2_double + (*ptr) * (*ptr);
 			nrm2 = nrm2 + (*ptr);
 			ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	nrm2_double = nrm2_compute(n, x, inc_x);
 	nrm2 = nrm2_compute(n, x, inc_x);
 #endif
 	nrm2 = sqrt(nrm2_double);
 	nrm2 = sqrt(nrm2);

 	return nrm2;
 }
--- a/kernel/arm64/iamax_thunderx2t99.c
+++ b/kernel/arm64/iamax_thunderx2t99.c
@@ -0,0 +1,380 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* "X" vector address */
 #define INC_X		"x2"	/* "X" stride */
 #define INDEX		"x3"	/* index of max/min value */
 #define Z		"x4"	/* vector index */
 #define J		"x5"	/* loop variable */

 #if !defined(DOUBLE)
 #define MAXF		"s0"
 #define TMPF0		"s1"
 #define TMPF1		"s4"
 #define N_KERNEL_SIZE	"64"
 #define SZ		"4"
 #define N_DIV_SHIFT	"6"
 #define N_REM_MASK	"63"
 #define INC_SHIFT	"2"
 #else
 #define MAXF		"d0"
 #define TMPF0		"d1"
 #define TMPF1		"d4"
 #define N_KERNEL_SIZE	"32"
 #define SZ		"8"
 #define N_DIV_SHIFT	"5"
 #define N_REM_MASK	"31"
 #define INC_SHIFT	"3"
 #endif

 /******************************************************************************/

 #if !defined(DOUBLE)
 #define KERNEL_F						\
 	"ldp	q2, q3, ["X"]				\n"	\
 	"ldp	q4, q5, ["X", #32]			\n"	\
 	"ldp	q6, q7, ["X", #64]			\n"	\
 	"ldp	q16, q17, ["X", #96]			\n"	\
 	"ldp	q18, q19, ["X", #128]			\n"	\
 	"ldp	q20, q21, ["X", #160]			\n"	\
 	"ldp	q22, q23, ["X", #192]			\n"	\
 	"ldp	q24, q25, ["X", #224]			\n"	\
 	"add	"X", "X", #256				\n"	\
 	"fabs	v2.4s, v2.4s				\n"	\
 	"fabs	v3.4s, v3.4s				\n"	\
 	"fabs	v4.4s, v4.4s				\n"	\
 	"fabs	v5.4s, v5.4s				\n"	\
 	"fabs	v6.4s, v6.4s				\n"	\
 	"fabs	v7.4s, v7.4s				\n"	\
 	"fabs	v16.4s, v16.4s				\n"	\
 	"fabs	v17.4s, v17.4s				\n"	\
 	"fabs	v18.4s, v18.4s				\n"	\
 	"fabs	v19.4s, v19.4s				\n"	\
 	"fabs	v20.4s, v20.4s				\n"	\
 	"fabs	v21.4s, v21.4s				\n"	\
 	"fabs	v22.4s, v22.4s				\n"	\
 	"fabs	v23.4s, v23.4s				\n"	\
 	"fabs	v24.4s, v24.4s				\n"	\
 	"fabs	v25.4s, v25.4s				\n"	\
 	"fmax	v2.4s, v2.4s, v3.4s			\n"	\
 	"fmax	v4.4s, v4.4s, v5.4s			\n"	\
 	"fmax	v6.4s, v6.4s, v7.4s			\n"	\
 	"fmax	v16.4s, v16.4s, v17.4s			\n"	\
 	"fmax	v18.4s, v18.4s, v19.4s			\n"	\
 	"fmax	v20.4s, v20.4s, v21.4s			\n"	\
 	"fmax	v22.4s, v22.4s, v23.4s			\n"	\
 	"fmax	v24.4s, v24.4s, v25.4s			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
 	"fmax	v2.4s, v2.4s, v4.4s			\n"	\
 	"fmax	v6.4s, v6.4s, v16.4s			\n"	\
 	"fmax	v18.4s, v18.4s, v20.4s			\n"	\
 	"fmax	v22.4s, v22.4s, v24.4s			\n"	\
 	"fmax	v2.4s, v2.4s, v6.4s			\n"	\
 	"fmax	v18.4s, v18.4s, v22.4s			\n"	\
 	"fmax	v2.4s, v2.4s, v18.4s			\n"	\
 	"fmaxv	"TMPF0", v2.4s				\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"

 #else

 #define KERNEL_F						\
 	"ldp	q2, q3, ["X"]				\n"	\
 	"ldp	q4, q5, ["X", #32]			\n"	\
 	"ldp	q6, q7, ["X", #64]			\n"	\
 	"ldp	q16, q17, ["X", #96]			\n"	\
 	"ldp	q18, q19, ["X", #128]			\n"	\
 	"ldp	q20, q21, ["X", #160]			\n"	\
 	"ldp	q22, q23, ["X", #192]			\n"	\
 	"ldp	q24, q25, ["X", #224]			\n"	\
 	"add	"X", "X", #256				\n"	\
 	"fabs	v2.2d, v2.2d				\n"	\
 	"fabs	v3.2d, v3.2d				\n"	\
 	"fabs	v4.2d, v4.2d				\n"	\
 	"fabs	v5.2d, v5.2d				\n"	\
 	"fabs	v6.2d, v6.2d				\n"	\
 	"fabs	v7.2d, v7.2d				\n"	\
 	"fabs	v16.2d, v16.2d				\n"	\
 	"fabs	v17.2d, v17.2d				\n"	\
 	"fabs	v18.2d, v18.2d				\n"	\
 	"fabs	v19.2d, v19.2d				\n"	\
 	"fabs	v20.2d, v20.2d				\n"	\
 	"fabs	v21.2d, v21.2d				\n"	\
 	"fabs	v22.2d, v22.2d				\n"	\
 	"fabs	v23.2d, v23.2d				\n"	\
 	"fabs	v24.2d, v24.2d				\n"	\
 	"fabs	v25.2d, v25.2d				\n"	\
 	"fmax	v2.2d, v2.2d, v3.2d			\n"	\
 	"fmax	v4.2d, v4.2d, v5.2d			\n"	\
 	"fmax	v6.2d, v6.2d, v7.2d			\n"	\
 	"fmax	v16.2d, v16.2d, v17.2d			\n"	\
 	"fmax	v18.2d, v18.2d, v19.2d			\n"	\
 	"fmax	v20.2d, v20.2d, v21.2d			\n"	\
 	"fmax	v22.2d, v22.2d, v23.2d			\n"	\
 	"fmax	v24.2d, v24.2d, v25.2d			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
 	"fmax	v2.2d, v2.2d, v4.2d			\n"	\
 	"fmax	v6.2d, v6.2d, v16.2d			\n"	\
 	"fmax	v18.2d, v18.2d, v20.2d			\n"	\
 	"fmax	v22.2d, v22.2d, v24.2d			\n"	\
 	"fmax	v2.2d, v2.2d, v6.2d			\n"	\
 	"fmax	v18.2d, v18.2d, v22.2d			\n"	\
 	"fmax	v2.2d, v2.2d, v18.2d			\n"	\
 	"ins	v3.d[0], v2.d[1]			\n"	\
 	"fmax	"TMPF0", d3, d2				\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"
 #endif

 #define KERNEL_F_FINALIZE					\
 	"sub	x6, "INDEX", #1				\n"	\
 	"lsl	x6, x6, #"INC_SHIFT" 			\n"	\
 	"add	x7, x7, x6				\n"	\
 	"mov	x6, #0					\n"	\
 	"1:						\n"	\
 	"add	x6, x6, #1				\n"	\
 	"cmp	x6, #"N_KERNEL_SIZE"			\n"	\
 	"bge	2f					\n"	\
 	"ldr	"TMPF1", [x7] 				\n"	\
 	"fabs	"TMPF1", "TMPF1"			\n"	\
 	"fcmp	"MAXF", "TMPF1"				\n"	\
 	"add	x7, x7, #"SZ"				\n"	\
 	"bne	1b					\n"	\
 	"2:						\n"	\
 	"sub	x6, x6, #1				\n"	\
 	"add	"INDEX", "INDEX", x6			\n"


 #define INIT							\
 	"lsl	"INC_X", "INC_X", #"INC_SHIFT"		\n"	\
 	"ldr	"MAXF", ["X"]				\n"	\
 	"add	"X", "X", "INC_X"			\n"	\
 	"mov	"Z", #1					\n"	\
 	"mov	"INDEX", "Z"				\n"	\
 	"fabs	"MAXF", "MAXF"				\n"


 #define KERNEL_S1						\
 	"ldr	"TMPF0", ["X"]				\n"	\
 	"add	"X", "X", "INC_X"			\n"	\
 	"add	"Z", "Z", #1				\n"	\
 	"fabs	"TMPF0", "TMPF0"			\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"


 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif


 static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG index = 0;

 	if ( n < 0 )  return index;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]				\n"
 	"	mov	"X", %[X_]				\n"
 	"	mov	"INC_X", %[INCX_]			\n"

 	"	cmp	"N", xzr				\n"
 	"	ble	.Liamax_kernel_zero			\n"
 	"	cmp	"INC_X", xzr				\n"
 	"	ble	.Liamax_kernel_zero			\n"
 	"	cmp	"INC_X", #1				\n"
 	"	bne	.Liamax_kernel_S_BEGIN			\n"
 	"	mov	x7, "X"					\n"

 	".Liamax_kernel_F_BEGIN:				\n"
 	"	"INIT"						\n"
 	"	subs	"N", "N", #1				\n"
 	"	ble	.Liamax_kernel_L999			\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"		\n"
 	"	cmp	"J", xzr				\n"
 	"	beq	.Liamax_kernel_F1			\n"
 	"	add	"Z", "Z", #1				\n"

 	".Liamax_kernel_F:					\n"
 	"	"KERNEL_F"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Liamax_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"				\n"
 	"	sub	"Z", "Z", #1				\n"

 	".Liamax_kernel_F1:					\n"
 	"	ands	"J", "N", #"N_REM_MASK"			\n"
 	"	ble	.Liamax_kernel_L999			\n"

 	".Liamax_kernel_F10:					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Liamax_kernel_F10			\n"
 	"	b	.Liamax_kernel_L999			\n"

 	".Liamax_kernel_S_BEGIN:				\n"
 	"	"INIT"						\n"
 	"	subs	"N", "N", #1				\n"
 	"	ble	.Liamax_kernel_L999			\n"
 	"	asr	"J", "N", #2				\n"
 	"	cmp	"J", xzr				\n"
 	"	ble	.Liamax_kernel_S1			\n"

 	".Liamax_kernel_S4:					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Liamax_kernel_S4			\n"

 	".Liamax_kernel_S1:					\n"
 	"	ands	"J", "N", #3				\n"
 	"	ble	.Liamax_kernel_L999			\n"

 	".Liamax_kernel_S10:					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Liamax_kernel_S10			\n"

 	".Liamax_kernel_L999:					\n"
 	"	mov	x0, "INDEX"				\n"
 	"	b	.Liamax_kernel_DONE			\n"

 	".Liamax_kernel_zero:					\n"
 	"	mov	x0, xzr					\n"

 	".Liamax_kernel_DONE:					\n"
 	"	mov	%[INDEX_], "INDEX"			\n"

 	: [INDEX_] "=r" (index)		//%0
 	: [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x)		//%3
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	return index;
 }

 #if defined(SMP)
 static int iamax_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 	*(BLASLONG *)result = iamax_compute(n, x, inc_x);

 	return 0;
 }
 #endif

 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
 #endif
 	BLASLONG max_index = 0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (inc_x == 0)
 		nthreads = 1;

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		max_index = iamax_compute(n, x, inc_x);
 	} else {
 		BLASLONG i, width, cur_index;
 		int num_cpu;
 		int mode;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		FLOAT max = -1.0;

 #if !defined(DOUBLE)
 		mode = BLAS_SINGLE;
 #else
 		mode = BLAS_DOUBLE;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, NULL, 0, result, 0,
 				   ( void *)iamax_thread_function, nthreads);

 		num_cpu = 0;
 		i = n;
 		cur_index = 0;

 		while (i > 0) {
 			FLOAT elem;
 			BLASLONG cur_max_index;

 			cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2];
 			elem = x[((cur_index + cur_max_index - 1) * inc_x)];
 			elem = fabs(elem);

 			if (elem >= max) {
 				max = elem;
 				max_index = cur_index + cur_max_index;
 			}

 			width = blas_quickdivide(i + nthreads - num_cpu - 1,
 				 nthreads - num_cpu);
 			i -= width;
 			cur_index += width;
 			num_cpu ++;
 		}
 	}
 #else
 	max_index = iamax_compute(n, x, inc_x);
 #endif

 	return max_index;
 }
--- a/kernel/arm64/izamax_thunderx2t99.c
+++ b/kernel/arm64/izamax_thunderx2t99.c
@@ -0,0 +1,390 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* "X" vector address */
 #define INC_X		"x2"	/* "X" stride */
 #define INDEX		"x3"	/* index of max/min value */
 #define Z		"x4"	/* vector index */
 #define J		"x5"	/* loop variable */

 #if !defined(DOUBLE)
 #define MAXF		"s0"
 #define TMPF0		"s1"
 #define TMPF0V		"v1.2s"
 #define TMPF1		"d4"
 #define TMPF1V		"v4.2s"
 #define N_KERNEL_SIZE	"32"
 #define SZ		"8"
 #define N_DIV_SHIFT	"5"
 #define N_REM_MASK	"31"
 #define INC_SHIFT	"3"
 #else
 #define MAXF		"d0"
 #define TMPF0		"d1"
 #define TMPF0V		"v1.2d"
 #define TMPF1		"q4"
 #define TMPF1V		"v4.2d"
 #define N_KERNEL_SIZE	"16"
 #define SZ		"16"
 #define N_DIV_SHIFT	"4"
 #define N_REM_MASK	"15"
 #define INC_SHIFT	"4"
 #endif

 /******************************************************************************/

 #if !defined(DOUBLE)
 #define KERNEL_F						\
 	"ldp	q2, q3, ["X"]				\n"	\
 	"ldp	q4, q5, ["X", #32]			\n"	\
 	"ldp	q6, q7, ["X", #64]			\n"	\
 	"ldp	q16, q17, ["X", #96]			\n"	\
 	"ldp	q18, q19, ["X", #128]			\n"	\
 	"ldp	q20, q21, ["X", #160]			\n"	\
 	"ldp	q22, q23, ["X", #192]			\n"	\
 	"ldp	q24, q25, ["X", #224]			\n"	\
 	"add	"X", "X", #256				\n"	\
 	"fabs	v2.4s, v2.4s				\n"	\
 	"fabs	v3.4s, v3.4s				\n"	\
 	"fabs	v4.4s, v4.4s				\n"	\
 	"fabs	v5.4s, v5.4s				\n"	\
 	"fabs	v6.4s, v6.4s				\n"	\
 	"fabs	v7.4s, v7.4s				\n"	\
 	"fabs	v16.4s, v16.4s				\n"	\
 	"fabs	v17.4s, v17.4s				\n"	\
 	"fabs	v18.4s, v18.4s				\n"	\
 	"fabs	v19.4s, v19.4s				\n"	\
 	"fabs	v20.4s, v20.4s				\n"	\
 	"fabs	v21.4s, v21.4s				\n"	\
 	"fabs	v22.4s, v22.4s				\n"	\
 	"fabs	v23.4s, v23.4s				\n"	\
 	"fabs	v24.4s, v24.4s				\n"	\
 	"fabs	v25.4s, v25.4s				\n"	\
 	"faddp	v2.4s, v2.4s, v3.4s			\n"	\
 	"faddp	v4.4s, v4.4s, v5.4s			\n"	\
 	"faddp	v6.4s, v6.4s, v7.4s			\n"	\
 	"faddp	v16.4s, v16.4s, v17.4s			\n"	\
 	"faddp	v18.4s, v18.4s, v19.4s			\n"	\
 	"faddp	v20.4s, v20.4s, v21.4s			\n"	\
 	"faddp	v22.4s, v22.4s, v23.4s			\n"	\
 	"faddp	v24.4s, v24.4s, v25.4s			\n"	\
 	"fmax	v2.4s, v2.4s, v4.4s			\n"	\
 	"fmax	v6.4s, v6.4s, v16.4s			\n"	\
 	"fmax	v18.4s, v18.4s, v20.4s			\n"	\
 	"fmax	v22.4s, v22.4s, v24.4s			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
 	"fmax	v2.4s, v2.4s, v6.4s			\n"	\
 	"fmax	v18.4s, v18.4s, v22.4s			\n"	\
 	"fmax	v2.4s, v2.4s, v18.4s			\n"	\
 	"fmaxv	"TMPF0", v2.4s				\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"

 #else

 #define KERNEL_F						\
 	"ldp	q2, q3, ["X"]				\n"	\
 	"ldp	q4, q5, ["X", #32]			\n"	\
 	"ldp	q6, q7, ["X", #64]			\n"	\
 	"ldp	q16, q17, ["X", #96]			\n"	\
 	"ldp	q18, q19, ["X", #128]			\n"	\
 	"ldp	q20, q21, ["X", #160]			\n"	\
 	"ldp	q22, q23, ["X", #192]			\n"	\
 	"ldp	q24, q25, ["X", #224]			\n"	\
 	"add	"X", "X", #256				\n"	\
 	"fabs	v2.2d, v2.2d				\n"	\
 	"fabs	v3.2d, v3.2d				\n"	\
 	"fabs	v4.2d, v4.2d				\n"	\
 	"fabs	v5.2d, v5.2d				\n"	\
 	"fabs	v6.2d, v6.2d				\n"	\
 	"fabs	v7.2d, v7.2d				\n"	\
 	"fabs	v16.2d, v16.2d				\n"	\
 	"fabs	v17.2d, v17.2d				\n"	\
 	"fabs	v18.2d, v18.2d				\n"	\
 	"fabs	v19.2d, v19.2d				\n"	\
 	"fabs	v20.2d, v20.2d				\n"	\
 	"fabs	v21.2d, v21.2d				\n"	\
 	"fabs	v22.2d, v22.2d				\n"	\
 	"fabs	v23.2d, v23.2d				\n"	\
 	"fabs	v24.2d, v24.2d				\n"	\
 	"fabs	v25.2d, v25.2d				\n"	\
 	"faddp	v2.2d, v2.2d, v3.2d			\n"	\
 	"faddp	v4.2d, v4.2d, v5.2d			\n"	\
 	"faddp	v6.2d, v6.2d, v7.2d			\n"	\
 	"faddp	v16.2d, v16.2d, v17.2d			\n"	\
 	"faddp	v18.2d, v18.2d, v19.2d			\n"	\
 	"faddp	v20.2d, v20.2d, v21.2d			\n"	\
 	"faddp	v22.2d, v22.2d, v23.2d			\n"	\
 	"faddp	v24.2d, v24.2d, v25.2d			\n"	\
 	"fmax	v2.2d, v2.2d, v4.2d			\n"	\
 	"fmax	v6.2d, v6.2d, v16.2d			\n"	\
 	"fmax	v18.2d, v18.2d, v20.2d			\n"	\
 	"fmax	v22.2d, v22.2d, v24.2d			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024]			\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+64]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+128]		\n"	\
 	"PRFM	PLDL1KEEP, ["X", #1024+192]		\n"	\
 	"fmax	v2.2d, v2.2d, v6.2d			\n"	\
 	"fmax	v18.2d, v18.2d, v22.2d			\n"	\
 	"fmax	v2.2d, v2.2d, v18.2d			\n"	\
 	"ins	v3.d[0], v2.d[1]			\n"	\
 	"fmax	"TMPF0", d3, d2				\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"	\
 	"add	"Z", "Z", #"N_KERNEL_SIZE"		\n"
 #endif

 #define KERNEL_F_FINALIZE					\
 	"sub	x6, "INDEX", #1				\n"	\
 	"lsl	x6, x6, #"INC_SHIFT" 			\n"	\
 	"add	x7, x7, x6				\n"	\
 	"mov	x6, #0					\n"	\
 	"1:						\n"	\
 	"add	x6, x6, #1				\n"	\
 	"cmp	x6, #"N_KERNEL_SIZE"			\n"	\
 	"bge	2f					\n"	\
 	"ldr	"TMPF1", [x7] 				\n"	\
 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"add	x7, x7, #"SZ"				\n"	\
 	"bne	1b					\n"	\
 	"2:						\n"	\
 	"sub	x6, x6, #1				\n"	\
 	"add	"INDEX", "INDEX", x6			\n"


 #define INIT							\
 	"lsl	"INC_X", "INC_X", #"INC_SHIFT"		\n"	\
 	"ldr	"TMPF1", ["X"] 				\n"	\
 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
 	"fmov	"MAXF" , "TMPF0"			\n"	\
 	"add	"X", "X", "INC_X"			\n"	\
 	"mov	"Z", #1					\n"	\
 	"mov	"INDEX", "Z"				\n"	\
 	"fabs	"MAXF", "MAXF"				\n"


 #define KERNEL_S1						\
 	"ldr	"TMPF1", ["X"]				\n"	\
 	"add	"X", "X", "INC_X"			\n"	\
 	"add	"Z", "Z", #1				\n"	\
 	"fabs	"TMPF1V", "TMPF1V"			\n"	\
 	"faddp	"TMPF0V", "TMPF1V", "TMPF1V"		\n"	\
 	"fcmp	"MAXF", "TMPF0"				\n"	\
 	"fcsel	"MAXF", "MAXF", "TMPF0", ge		\n"	\
 	"csel	"INDEX", "INDEX", "Z", ge		\n"


 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif


 static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG index = 0;

 	if ( n < 0 )  return index;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]				\n"
 	"	mov	"X", %[X_]				\n"
 	"	mov	"INC_X", %[INCX_]			\n"

 	"	cmp	"N", xzr				\n"
 	"	ble	.Lizamax_kernel_zero			\n"
 	"	cmp	"INC_X", xzr				\n"
 	"	ble	.Lizamax_kernel_zero			\n"
 	"	cmp	"INC_X", #1				\n"
 	"	bne	.Lizamax_kernel_S_BEGIN			\n"
 	"	mov	x7, "X"					\n"

 	".Lizamax_kernel_F_BEGIN:				\n"
 	"	"INIT"						\n"
 	"	subs	"N", "N", #1				\n"
 	"	ble	.Lizamax_kernel_L999			\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"		\n"
 	"	cmp	"J", xzr				\n"
 	"	beq	.Lizamax_kernel_F1			\n"
 	"	add	"Z", "Z", #1				\n"

 	".Lizamax_kernel_F:					\n"
 	"	"KERNEL_F"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Lizamax_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"				\n"
 	"	sub	"Z", "Z", #1				\n"

 	".Lizamax_kernel_F1:					\n"
 	"	ands	"J", "N", #"N_REM_MASK"			\n"
 	"	ble	.Lizamax_kernel_L999			\n"

 	".Lizamax_kernel_F10:					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Lizamax_kernel_F10			\n"
 	"	b	.Lizamax_kernel_L999			\n"

 	".Lizamax_kernel_S_BEGIN:				\n"
 	"	"INIT"						\n"
 	"	subs	"N", "N", #1				\n"
 	"	ble	.Lizamax_kernel_L999			\n"
 	"	asr	"J", "N", #2				\n"
 	"	cmp	"J", xzr				\n"
 	"	ble	.Lizamax_kernel_S1			\n"

 	".Lizamax_kernel_S4:					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Lizamax_kernel_S4			\n"

 	".Lizamax_kernel_S1:					\n"
 	"	ands	"J", "N", #3				\n"
 	"	ble	.Lizamax_kernel_L999			\n"

 	".Lizamax_kernel_S10:					\n"
 	"	"KERNEL_S1"					\n"
 	"	subs	"J", "J", #1				\n"
 	"	bne	.Lizamax_kernel_S10			\n"

 	".Lizamax_kernel_L999:					\n"
 	"	mov	x0, "INDEX"				\n"
 	"	b	.Lizamax_kernel_DONE			\n"

 	".Lizamax_kernel_zero:					\n"
 	"	mov	x0, xzr					\n"

 	".Lizamax_kernel_DONE:					\n"
 	"	mov	%[INDEX_], "INDEX"			\n"

 	: [INDEX_] "=r" (index)		//%0
 	: [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x)		//%3
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	return index;
 }

 #if defined(SMP)
 static int izamax_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 	*(BLASLONG *)result = izamax_compute(n, x, inc_x);

 	return 0;
 }
 #endif

 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha[2];
 #endif
 	BLASLONG max_index = 0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (inc_x == 0)
 		nthreads = 1;

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		max_index = izamax_compute(n, x, inc_x);
 	} else {
 		BLASLONG i, width, cur_index;
 		int num_cpu;
 		int mode;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		FLOAT max = -1.0;

 #if !defined(DOUBLE)
 		mode = BLAS_SINGLE | BLAS_COMPLEX;
 #else
 		mode = BLAS_DOUBLE | BLAS_COMPLEX;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, NULL, 0, result, 0,
 				   ( void *)izamax_thread_function, nthreads);

 		num_cpu = 0;
 		i = n;
 		cur_index = 0;

 		while (i > 0) {
 			FLOAT elem_r, elem_i;
 			BLASLONG cur_max_index;

 			cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2];
 			elem_r = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 0];
 			elem_i = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 1];
 			elem_r = fabs(elem_r) + fabs(elem_i);

 			if (elem_r >= max) {
 				max = elem_r;
 				max_index = cur_index + cur_max_index;
 			}

 			width = blas_quickdivide(i + nthreads - num_cpu - 1,
 				 nthreads - num_cpu);
 			i -= width;
 			cur_index += width;
 			num_cpu ++;
 		}
 	}
 #else
 	max_index = izamax_compute(n, x, inc_x);
 #endif

 	return max_index;
 }
--- a/kernel/arm64/scnrm2_thunderx2t99.c
+++ b/kernel/arm64/scnrm2_thunderx2t99.c
@@ -0,0 +1,355 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/


 #include "common.h"

 #include <arm_neon.h>

 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif

 #if !defined(COMPLEX)
 #define N		"x0"	/* vector length */
 #define X		"x1"	/* X vector address */
 #define INC_X		"x2"	/* X stride */
 #define J		"x5"	/* loop variable */

 #define TMPF		"s16"
 #define TMPFD		"d17"
 #define SSQD		"d0"

 #define N_DIV_SHIFT	"6"
 #define N_REM_MASK	"63"
 #define INC_SHIFT	"2"

 #define KERNEL_F1					\
 	"ldr	"TMPF", ["X"], #4		\n"	\
 	"fcvt	"TMPFD", "TMPF"			\n"	\
 	"fmadd	"SSQD", "TMPFD", "TMPFD", "SSQD"\n"

 #define KERNEL_F					\
 	KERNEL_F32					\
 	KERNEL_F32

 #define KERNEL_F32					\
 	"ldur	q16, ["X"]			\n"	\
 	"ldur	q18, ["X", #16]			\n"	\
 	"ldur	q20, ["X", #32]			\n"	\
 	"ldur	q22, ["X", #48]			\n"	\
 	"ldur	q24, ["X", #64]			\n"	\
 	"ldur	q26, ["X", #80]			\n"	\
 	"ldur	q28, ["X", #96]			\n"	\
 	"ldur	q30, ["X", #112]		\n"	\
 	"add	"X", "X", #128			\n"	\
 	"fcvtl2	v17.2d, v16.4s			\n"	\
 	"fcvtl	v16.2d, v16.2s			\n"	\
 	"fcvtl2	v19.2d, v18.4s			\n"	\
 	"fcvtl	v18.2d, v18.2s			\n"	\
 	"fcvtl2	v21.2d, v20.4s			\n"	\
 	"fcvtl	v20.2d, v20.2s			\n"	\
 	"fcvtl2	v23.2d, v22.4s			\n"	\
 	"fcvtl	v22.2d, v22.2s			\n"	\
 	"fcvtl2	v25.2d, v24.4s			\n"	\
 	"fcvtl	v24.2d, v24.2s			\n"	\
 	"fcvtl2	v27.2d, v26.4s			\n"	\
 	"fcvtl	v26.2d, v26.2s			\n"	\
 	"fcvtl2	v29.2d, v28.4s			\n"	\
 	"fcvtl	v28.2d, v28.2s			\n"	\
 	"fcvtl2	v31.2d, v30.4s			\n"	\
 	"fcvtl	v30.2d, v30.2s			\n"	\
 	"fmla	v0.2d, v16.2d, v16.2d		\n"	\
 	"fmla	v1.2d, v17.2d, v17.2d		\n"	\
 	"fmla	v2.2d, v18.2d, v18.2d		\n"	\
 	"fmla	v3.2d, v19.2d, v19.2d		\n"	\
 	"fmla	v4.2d, v20.2d, v20.2d		\n"	\
 	"fmla	v5.2d, v21.2d, v21.2d		\n"	\
 	"fmla	v6.2d, v22.2d, v22.2d		\n"	\
 	"fmla	v7.2d, v23.2d, v23.2d		\n"	\
 	"fmla	v0.2d, v24.2d, v24.2d		\n"	\
 	"fmla	v1.2d, v25.2d, v25.2d		\n"	\
 	"fmla	v2.2d, v26.2d, v26.2d		\n"	\
 	"fmla	v3.2d, v27.2d, v27.2d		\n"	\
 	"fmla	v4.2d, v28.2d, v28.2d		\n"	\
 	"fmla	v5.2d, v29.2d, v29.2d		\n"	\
 	"fmla	v6.2d, v30.2d, v30.2d		\n"	\
 	"fmla	v7.2d, v31.2d, v31.2d		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024]		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+64]	\n"

 #define KERNEL_F_FINALIZE				\
 	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
 	"fadd	v6.2d, v6.2d, v7.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v4.2d		\n"	\
 	"faddp	"SSQD", v0.2d			\n"

 #define KERNEL_S1					\
 	"ldr	"TMPF", ["X"]			\n"	\
 	"add	"X", "X", "INC_X"		\n"	\
 	"fcvt	"TMPFD", "TMPF"			\n"	\
 	"fmadd	"SSQD", "TMPFD", "TMPFD", "SSQD"\n"

 #define KERNEL_FINALIZE					\
 	""

 #else

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* X vector address */
 #define INC_X		"x2"	/* X stride */
 #define J		"x5"	/* loop variable */

 #define TMPF		"d16"
 #define SSQD		"d0"

 #define N_DIV_SHIFT	"4"
 #define N_REM_MASK	"15"
 #define INC_SHIFT	"3"

 #define KERNEL_F1					\
 	"ldr	"TMPF", ["X"]			\n"	\
 	"add	"X", "X", #8			\n"	\
 	"fcvtl	v16.2d, v16.2s			\n"	\
 	"fmla	v0.2d, v16.2d, v16.2d		\n"

 #define KERNEL_F					\
 	"ldur	q16, ["X"]			\n"	\
 	"ldur	q18, ["X", #16]			\n"	\
 	"ldur	q20, ["X", #32]			\n"	\
 	"ldur	q22, ["X", #48]			\n"	\
 	"ldur	q24, ["X", #64]			\n"	\
 	"ldur	q26, ["X", #80]			\n"	\
 	"ldur	q28, ["X", #96]			\n"	\
 	"ldur	q30, ["X", #112]		\n"	\
 	"add	"X", "X", #128			\n"	\
 	"fcvtl2	v17.2d, v16.4s			\n"	\
 	"fcvtl	v16.2d, v16.2s			\n"	\
 	"fcvtl2	v19.2d, v18.4s			\n"	\
 	"fcvtl	v18.2d, v18.2s			\n"	\
 	"fcvtl2	v21.2d, v20.4s			\n"	\
 	"fcvtl	v20.2d, v20.2s			\n"	\
 	"fcvtl2	v23.2d, v22.4s			\n"	\
 	"fcvtl	v22.2d, v22.2s			\n"	\
 	"fcvtl2	v25.2d, v24.4s			\n"	\
 	"fcvtl	v24.2d, v24.2s			\n"	\
 	"fcvtl2	v27.2d, v26.4s			\n"	\
 	"fcvtl	v26.2d, v26.2s			\n"	\
 	"fcvtl2	v29.2d, v28.4s			\n"	\
 	"fcvtl	v28.2d, v28.2s			\n"	\
 	"fcvtl2	v31.2d, v30.4s			\n"	\
 	"fcvtl	v30.2d, v30.2s			\n"	\
 	"fmla	v0.2d, v16.2d, v16.2d		\n"	\
 	"fmla	v1.2d, v17.2d, v17.2d		\n"	\
 	"fmla	v2.2d, v18.2d, v18.2d		\n"	\
 	"fmla	v3.2d, v19.2d, v19.2d		\n"	\
 	"fmla	v4.2d, v20.2d, v20.2d		\n"	\
 	"fmla	v5.2d, v21.2d, v21.2d		\n"	\
 	"fmla	v6.2d, v22.2d, v22.2d		\n"	\
 	"fmla	v7.2d, v23.2d, v23.2d		\n"	\
 	"fmla	v0.2d, v24.2d, v24.2d		\n"	\
 	"fmla	v1.2d, v25.2d, v25.2d		\n"	\
 	"fmla	v2.2d, v26.2d, v26.2d		\n"	\
 	"fmla	v3.2d, v27.2d, v27.2d		\n"	\
 	"fmla	v4.2d, v28.2d, v28.2d		\n"	\
 	"fmla	v5.2d, v29.2d, v29.2d		\n"	\
 	"fmla	v6.2d, v30.2d, v30.2d		\n"	\
 	"fmla	v7.2d, v31.2d, v31.2d		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024]		\n"	\
 	"prfm	PLDL1KEEP, ["X", #1024+64]	\n"

 #define KERNEL_F_FINALIZE				\
 	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
 	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
 	"fadd	v6.2d, v6.2d, v7.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
 	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
 	"fadd	v0.2d, v0.2d, v4.2d		\n"

 #define KERNEL_FINALIZE					\
 	"faddp	"SSQD", v0.2d			\n"

 #define KERNEL_S1					\
 	"ldr	"TMPF", ["X"]			\n"	\
 	"add	"X", "X", "INC_X"		\n"	\
 	"fcvtl	v16.2d, v16.2s			\n"	\
 	"fmla	v0.2d, v16.2d, v16.2d		\n"
 #endif


 static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	double  ret = 0.0 ;

 	if (n <= 0)  return ret;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]			\n"
 	"	mov	"X", %[X_]			\n"
 	"	mov	"INC_X", %[INCX_]		\n"
 	"	fmov	"SSQD", xzr			\n"
 	"	fmov	d1, xzr				\n"
 	"	fmov	d2, xzr				\n"
 	"	fmov	d3, xzr				\n"
 	"	fmov	d4, xzr				\n"
 	"	fmov	d5, xzr				\n"
 	"	fmov	d6, xzr				\n"
 	"	fmov	d7, xzr				\n"
 	"	cmp	"N", xzr			\n"
 	"	ble	.Lnrm2_kernel_L999		\n"
 	"	cmp	"INC_X", xzr			\n"
 	"	ble	.Lnrm2_kernel_L999		\n"
 	"	cmp	"INC_X", #1			\n"
 	"	bne	.Lnrm2_kernel_S_BEGIN		\n"

 	".Lnrm2_kernel_F_BEGIN:				\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"	\n"
 	"	cmp	"J", xzr			\n"
 	"	beq	.Lnrm2_kernel_S_BEGIN		\n"

 	"	.align 5				\n"
 	".Lnrm2_kernel_F:				\n"
 	"	"KERNEL_F"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"			\n"

 	".Lnrm2_kernel_F1:				\n"
 	"	ands	"J", "N", #"N_REM_MASK"		\n"
 	"	ble	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_F10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_F10		\n"
 	"	b	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_S_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", #"INC_SHIFT"	\n"
 	"	asr	"J", "N", #2			\n"
 	"	cmp	"J", xzr			\n"
 	"	ble	.Lnrm2_kernel_S1		\n"

 	".Lnrm2_kernel_S4:				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	"KERNEL_S1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_S4		\n"

 	".Lnrm2_kernel_S1:				\n"
 	"	ands	"J", "N", #3			\n"
 	"	ble	.Lnrm2_kernel_L999		\n"

 	".Lnrm2_kernel_S10:				\n"
 	"	"KERNEL_S1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Lnrm2_kernel_S10		\n"

 	".Lnrm2_kernel_L999:				\n"
 	"	"KERNEL_FINALIZE"			\n"
 	"	fmov	%[RET_], "SSQD"			\n"

 	: [RET_]  "=r" (ret)		//%0
 	: [N_]    "r"  (n),		//%1
 	  [X_]    "r"  (x),		//%2
 	  [INCX_] "r"  (inc_x)		//%3
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	return ret;
 }

 #if defined(SMP)
 static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3,
 	BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
 {
 	*(double *)result = nrm2_compute(n, x, inc_x);

 	return 0;
 }
 #endif

 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha[2];
 #endif
 	FLOAT nrm2 = 0.0;
 	double nrm2_double = 0.0;

 	if (n <= 0 || inc_x <= 0) return 0.0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		nrm2_double = nrm2_compute(n, x, inc_x);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		double *ptr;

 #if !defined(COMPLEX)
 		mode = BLAS_SINGLE  | BLAS_REAL;
 #else
 		mode = BLAS_SINGLE  | BLAS_COMPLEX;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, NULL, 0, result, 0,
 				   ( void *)nrm2_thread_function, nthreads);

 		ptr = (double *)result;
 		for (i = 0; i < nthreads; i++) {
 			nrm2_double = nrm2_double + (*ptr);
 			ptr = (double *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	nrm2_double = nrm2_compute(n, x, inc_x);
 #endif
 	nrm2 = sqrt(nrm2_double);

 	return nrm2;
 }
--- a/kernel/arm64/swap_thunderx2t99.S
+++ b/kernel/arm64/swap_thunderx2t99.S
@@ -1,5 +1,5 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -29,61 +29,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */
 #define	X	x3	/* X vector address */
 #define	INC_X	x4	/* X stride */
 #define	Y	x5	/* Y vector address */
 #define	INC_Y	x6	/* Y stride */
 #define I	x1	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 #define TMPF	s6
 #define SSQ	s0
 #define TMPVF	{v6.s}[0]
 #define SZ	4

 /******************************************************************************/

 .macro INIT_F1
 	ldr	TMPF, [X], #SZ
 	fmul	SSQ, TMPF, TMPF
 .endm
 #if !defined(COMPLEX)
 #if !defined(DOUBLE)
 #define TMPF0		s0
 #define TMPF1		s1
 #define INC_SHIFT	2
 #define N_DIV_SHIFT	2
 #define N_REM_MASK	3
 #else
 #define TMPF0		d0
 #define TMPF1		d1
 #define INC_SHIFT	3
 #define N_DIV_SHIFT	1
 #define N_REM_MASK	1
 #endif
 #else
 #if !defined(DOUBLE)
 #define TMPF0		d0
 #define TMPF1		d1
 #define INC_SHIFT	3
 #define N_DIV_SHIFT	1
 #define N_REM_MASK	1
 #else
 #define TMPF0		q0
 #define TMPF1		q1
 #define INC_SHIFT	4
 #define N_DIV_SHIFT	0
 #define N_REM_MASK	0
 #endif
 #endif

 .macro KERNEL_F1
 	ldr	TMPF, [X], #SZ
 	fmul	TMPF, TMPF, TMPF
 	fadd	SSQ, SSQ, TMPF
 	ldr	TMPF0, [X]
 	ldr	TMPF1, [Y]
 	str	TMPF0, [Y]
 	str	TMPF1, [X]
 	add	X, X, INC_X
 	add	Y, Y, INC_Y
 .endm

 .macro INIT_F4
 	ld1	{v1.4s}, [X], #16
 	fmul	v1.4s, v1.4s, v1.4s
 	ext	v2.16b, v1.16b, v1.16b, #8
 	fadd	v2.2s, v1.2s, v2.2s
 	faddp	SSQ, v2.2s
 .endm
 .macro KERNEL_F
 	ldr	q0, [X]
 	ldr	q1, [Y]
 	add	X, X, #16
 	add	Y, Y, #16

 .macro KERNEL_F4
 	ld1	{v1.4s}, [X], #16
 	fmul	v1.4s, v1.4s, v1.4s
 	ext	v2.16b, v1.16b, v1.16b, #8
 	fadd	v2.2s, v1.2s, v2.2s
 	faddp	TMPF, v2.2s
 	fadd	SSQ, SSQ, TMPF
 .endm
 	prfm	PLDL1STRM, [X, #1024]
 	prfm	PLDL1STRM, [Y, #1024]

 .macro INIT_S
 	lsl	INC_X, INC_X, #2
 	ld1	TMPVF, [X], INC_X
 	fmul	SSQ, TMPF, TMPF
 	str	q0, [Y, #-16]
 	str	q1, [X, #-16]
 .endm

 .macro KERNEL_S1
 	ld1	TMPVF, [X], INC_X
 	fmul	TMPF, TMPF, TMPF
 	fadd	SSQ, SSQ, TMPF
 .macro INIT
 	lsl	INC_X, INC_X, #INC_SHIFT
 	lsl	INC_Y, INC_Y, #INC_SHIFT
 .endm


 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/
@@ -91,88 +104,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	PROLOGUE

 	cmp	N, xzr
 	ble	nrm2_kernel_zero
 	cmp	INC_X, xzr
 	ble	nrm2_kernel_zero
 	ble	.Lswap_kernel_L999

 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN
 	bne	.Lswap_kernel_S_BEGIN
 	cmp	INC_Y, #1
 	bne	.Lswap_kernel_S_BEGIN

 nrm2_kernel_F_BEGIN:
 .Lswap_kernel_F_BEGIN:
 	INIT

 	asr	I, N, #2
 	asr	I, N, #N_DIV_SHIFT
 	cmp	I, xzr
 	beq	nrm2_kernel_F1_INIT
 	beq	.Lswap_kernel_F1

 	INIT_F4
 	subs	I, I, #1
 	beq	nrm2_kernel_F1
 	.align 5
 .Lswap_kernel_F:

 nrm2_kernel_F4:

 	KERNEL_F4
 	KERNEL_F

 	subs	I, I, #1
 	bne	nrm2_kernel_F4
 	bne	.Lswap_kernel_F

 nrm2_kernel_F1:
 .Lswap_kernel_F1:

 	ands	I, N, #3
 	ble	nrm2_kernel_L999
 #if defined(DOUBLE) && defined(COMPLEX)
 	b	.Lswap_kernel_L999
 #else
 	ands	I, N, #N_REM_MASK
 	ble	.Lswap_kernel_L999
 #endif

 nrm2_kernel_F10:
 .Lswap_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     nrm2_kernel_F10
 	bne     .Lswap_kernel_F10

 	b	nrm2_kernel_L999
 	b	.Lswap_kernel_L999

 nrm2_kernel_F1_INIT:
 	INIT_F1
 	subs	N, N, #1
 	b	nrm2_kernel_F1

 nrm2_kernel_S_BEGIN:
 .Lswap_kernel_S_BEGIN:

 	INIT_S

 	subs	N, N, #1
 	ble	nrm2_kernel_L999
 	INIT

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	nrm2_kernel_S1
 	ble	.Lswap_kernel_S1

 nrm2_kernel_S4:
 .Lswap_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1

 	subs	I, I, #1
 	bne	nrm2_kernel_S4
 	bne	.Lswap_kernel_S4

 nrm2_kernel_S1:
 .Lswap_kernel_S1:

 	ands	I, N, #3
 	ble	nrm2_kernel_L999
 	ble	.Lswap_kernel_L999

 nrm2_kernel_S10:
 .Lswap_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
 	bne     nrm2_kernel_S10
 	KERNEL_F1

 nrm2_kernel_L999:
 	fsqrt	SSQ, SSQ
 	ret
 	subs	I, I, #1
 	bne	.Lswap_kernel_S10

 nrm2_kernel_zero:
 	fmov	SSQ, wzr
 .Lswap_kernel_L999:

 	mov	w0, wzr
 	ret

 	EPILOGUE
--- a/kernel/arm64/zdot_thunderx2t99.c
+++ b/kernel/arm64/zdot_thunderx2t99.c
@@ -0,0 +1,357 @@
 /***************************************************************************
 Copyright (c) 2017, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/


 #include "common.h"

 #include <arm_neon.h>

 #define N		"x0"	/* vector length */
 #define X		"x1"	/* "X" vector address */
 #define INC_X		"x2"	/* "X" stride */
 #define Y		"x3"	/* "Y" vector address */
 #define INC_Y		"x4"	/* "Y" stride */
 #define J		"x5"	/* loop variable */

 #if !defined(DOUBLE)
 #define REG0		"wzr"
 #define DOTF		"s0"
 #define DOTI		"s1"
 #define INC_SHIFT	"3"
 #define N_DIV_SHIFT	"4"
 #define N_REM_MASK	"15"
 #else
 #define REG0		"xzr"
 #define DOTF		"d0"
 #define DOTI		"d1"
 #define INC_SHIFT	"4"
 #define N_DIV_SHIFT	"3"
 #define N_REM_MASK	"7"
 #endif

 #if !defined(CONJ)
 #define f_ii		"fmls"
 #define f_ir		"fmla"
 #define a_ii		"fsub"
 #define a_ir		"fadd"
 #else
 #define f_ii		"fmla"
 #define f_ir		"fmls"
 #define a_ii		"fadd"
 #define a_ir		"fsub"
 #endif

 #if !defined(DOUBLE)
 #define KERNEL_F1						\
 	"	ldr	d16, ["X"]			\n"	\
 	"	ldr	d24, ["Y"]			\n"	\
 	"	add	"X", "X", "INC_X"		\n"	\
 	"	add	"Y", "Y", "INC_Y"		\n"	\
 	"	ins	v17.s[0], v16.s[1]		\n"	\
 	"	fmla	"DOTF", s16, v24.s[0]		\n"	\
 	"	"f_ii"	"DOTF", s17, v24.s[1]		\n"	\
 	"	"f_ir"	"DOTI", s17, v24.s[0]		\n"	\
 	"	fmla	"DOTI", s16, v24.s[1]		\n"

 #define KERNEL_F						\
 	"	ld2	{v16.4s, v17.4s}, ["X"]		\n"	\
 	"	ld2	{v24.4s, v25.4s}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	ld2	{v18.4s, v19.4s}, ["X"]		\n"	\
 	"	ld2	{v26.4s, v27.4s}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	fmla	v0.4s, v16.4s, v24.4s		\n"	\
 	"	fmla	v1.4s, v17.4s, v25.4s		\n"	\
 	"	fmla	v2.4s, v16.4s, v25.4s		\n"	\
 	"	fmla	v3.4s, v17.4s, v24.4s		\n"	\
 	"	ld2	{v20.4s, v21.4s}, ["X"]		\n"	\
 	"	ld2	{v28.4s, v29.4s}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	fmla	v4.4s, v18.4s, v26.4s		\n"	\
 	"	fmla	v5.4s, v19.4s, v27.4s		\n"	\
 	"	fmla	v6.4s, v18.4s, v27.4s		\n"	\
 	"	fmla	v7.4s, v19.4s, v26.4s		\n"	\
 	"	ld2	{v22.4s, v23.4s}, ["X"]		\n"	\
 	"	ld2	{v30.4s, v31.4s}, ["Y"]		\n"	\
 	"	fmla	v0.4s, v20.4s, v28.4s		\n"	\
 	"	fmla	v1.4s, v21.4s, v29.4s		\n"	\
 	"	fmla	v2.4s, v20.4s, v29.4s		\n"	\
 	"	fmla	v3.4s, v21.4s, v28.4s		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #1024]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #1024+64]	\n"	\
 	"	fmla	v4.4s, v22.4s, v30.4s		\n"	\
 	"	fmla	v5.4s, v23.4s, v31.4s		\n"	\
 	"	fmla	v6.4s, v22.4s, v31.4s		\n"	\
 	"	fmla	v7.4s, v23.4s, v30.4s		\n"

 #define KERNEL_F_FINALIZE					\
 	"	fadd	v0.4s, v0.4s, v4.4s		\n"	\
 	"	fadd	v1.4s, v1.4s, v5.4s		\n"	\
 	"	fadd	v2.4s, v2.4s, v6.4s		\n"	\
 	"	fadd	v3.4s, v3.4s, v7.4s		\n"	\
 	"	"a_ii"	v0.4s, v0.4s, v1.4s		\n"	\
 	"	"a_ir"	v1.4s, v2.4s, v3.4s		\n"	\
 	"	faddp	v0.4s, v0.4s, v0.4s		\n"	\
 	"	faddp	v0.4s, v0.4s, v0.4s		\n"	\
 	"	faddp	v1.4s, v1.4s, v1.4s		\n"	\
 	"	faddp	v1.4s, v1.4s, v1.4s		\n"

 #else

 #define KERNEL_F1						\
 	"	ldr	q16, ["X"]			\n"	\
 	"	ldr	q24, ["Y"]			\n"	\
 	"	add	"X", "X", "INC_X"		\n"	\
 	"	add	"Y", "Y", "INC_Y"		\n"	\
 	"	ins	v17.d[0], v16.d[1]		\n"	\
 	"	fmla	"DOTF", d16, v24.d[0]		\n"	\
 	"	"f_ii"	"DOTF", d17, v24.d[1]		\n"	\
 	"	"f_ir"	"DOTI", d17, v24.d[0]		\n"	\
 	"	fmla	"DOTI", d16, v24.d[1]		\n"

 #define KERNEL_F						\
 	"	ld2	{v16.2d, v17.2d}, ["X"]		\n"	\
 	"	ld2	{v24.2d, v25.2d}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	ld2	{v18.2d, v19.2d}, ["X"]		\n"	\
 	"	ld2	{v26.2d, v27.2d}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	fmla	v0.2d, v16.2d, v24.2d		\n"	\
 	"	fmla	v1.2d, v17.2d, v25.2d		\n"	\
 	"	fmla	v2.2d, v16.2d, v25.2d		\n"	\
 	"	fmla	v3.2d, v17.2d, v24.2d		\n"	\
 	"	ld2	{v20.2d, v21.2d}, ["X"]		\n"	\
 	"	ld2	{v28.2d, v29.2d}, ["Y"]		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	fmla	v4.2d, v18.2d, v26.2d		\n"	\
 	"	fmla	v5.2d, v19.2d, v27.2d		\n"	\
 	"	fmla	v6.2d, v18.2d, v27.2d		\n"	\
 	"	fmla	v7.2d, v19.2d, v26.2d		\n"	\
 	"	ld2	{v22.2d, v23.2d}, ["X"]		\n"	\
 	"	ld2	{v30.2d, v31.2d}, ["Y"]		\n"	\
 	"	fmla	v0.2d, v20.2d, v28.2d		\n"	\
 	"	fmla	v1.2d, v21.2d, v29.2d		\n"	\
 	"	fmla	v2.2d, v20.2d, v29.2d		\n"	\
 	"	fmla	v3.2d, v21.2d, v28.2d		\n"	\
 	"	add	"X", "X", #32			\n"	\
 	"	add	"Y", "Y", #32			\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #1024]		\n"	\
 	"	PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
 	"	PRFM	PLDL1KEEP, ["Y", #1024+64]	\n"	\
 	"	fmla	v4.2d, v22.2d, v30.2d		\n"	\
 	"	fmla	v5.2d, v23.2d, v31.2d		\n"	\
 	"	fmla	v6.2d, v22.2d, v31.2d		\n"	\
 	"	fmla	v7.2d, v23.2d, v30.2d		\n"

 #define KERNEL_F_FINALIZE					\
 	"	fadd	v0.2d, v0.2d, v4.2d		\n"	\
 	"	fadd	v1.2d, v1.2d, v5.2d		\n"	\
 	"	fadd	v2.2d, v2.2d, v6.2d		\n"	\
 	"	fadd	v3.2d, v3.2d, v7.2d		\n"	\
 	"	"a_ii"	v0.2d, v0.2d, v1.2d		\n"	\
 	"	"a_ir"	v1.2d, v2.2d, v3.2d		\n"	\
 	"	faddp	"DOTF", v0.2d			\n"	\
 	"	faddp	"DOTI", v1.2d			\n"
 #endif

 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
 	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
 	void *c, BLASLONG ldc, int (*function)(), int nthreads);
 #endif

 static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result)
 {
 	FLOAT dotr = 0.0, doti = 0.0;
 	CREAL(*result) = 0.0;
 	CIMAG(*result) = 0.0;

 	if ( n < 0 ) return;

 	__asm__ __volatile__ (
 	"	mov	"N", %[N_]			\n"
 	"	mov	"X", %[X_]			\n"
 	"	mov	"INC_X", %[INCX_]		\n"
 	"	mov	"Y", %[Y_]			\n"
 	"	mov	"INC_Y", %[INCY_]		\n"
 	"	fmov	"DOTF", "REG0"			\n"
 	"	fmov	"DOTI", "REG0"			\n"
 	"	fmov	d2, xzr				\n"
 	"	fmov	d3, xzr				\n"
 	"	fmov	d4, xzr				\n"
 	"	fmov	d5, xzr				\n"
 	"	fmov	d6, xzr				\n"
 	"	fmov	d7, xzr				\n"
 	"	cmp	"N", xzr			\n"
 	"	ble	.Ldot_kernel_L999		\n"
 	"	cmp	"INC_X", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"
 	"	cmp	"INC_Y", #1			\n"
 	"	bne	.Ldot_kernel_S_BEGIN		\n"

 	".Ldot_kernel_F_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n"
 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n"
 	"	asr	"J", "N", #"N_DIV_SHIFT"	\n"
 	"	cmp	"J", xzr			\n"
 	"	beq	.Ldot_kernel_F1			\n"

 	"	.align 5				\n"
 	".Ldot_kernel_F:				\n"
 	"	"KERNEL_F"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F			\n"
 	"	"KERNEL_F_FINALIZE"			\n"

 	".Ldot_kernel_F1:				\n"
 	"	ands	"J", "N", #"N_REM_MASK"		\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_F10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_F10		\n"
 	"	b	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S_BEGIN:				\n"
 	"	lsl	"INC_X", "INC_X", "INC_SHIFT"	\n"
 	"	lsl	"INC_Y", "INC_Y", "INC_SHIFT"	\n"
 	"	asr	"J", "N", #2			\n"
 	"	cmp	"J", xzr			\n"
 	"	ble	.Ldot_kernel_S1			\n"

 	".Ldot_kernel_S4:				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S4			\n"

 	".Ldot_kernel_S1:				\n"
 	"	ands	"J", "N", #3			\n"
 	"	ble	.Ldot_kernel_L999		\n"

 	".Ldot_kernel_S10:				\n"
 	"	"KERNEL_F1"				\n"
 	"	subs	"J", "J", #1			\n"
 	"	bne	.Ldot_kernel_S10		\n"

 	".Ldot_kernel_L999:				\n"
 	"	str	"DOTF", [%[DOTR_]]		\n"
 	"	str	"DOTI", [%[DOTI_]]		\n"

 	:
 	: [DOTR_]  "r"  (&dotr),	//%0
 	  [DOTI_]  "r"  (&doti),	//%1
 	  [N_]     "r"  (n),		//%2
 	  [X_]     "r"  (x),		//%3
 	  [INCX_]  "r"  (inc_x),	//%4
 	  [Y_]     "r"  (y),		//%5
 	  [INCY_]  "r"  (inc_y)		//%6
 	: "cc",
 	  "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5",
 	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
 	);

 	CREAL(*result) = dotr;
 	CIMAG(*result) = doti;
 	return;
 }

 #if defined(SMP)
 static int zdot_thread_function(BLASLONG n, BLASLONG dummy0,
 	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
 	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
 {
 	zdot_compute(n, x, inc_x, y, inc_y, (void *)result);

 	return 0;
 }
 #endif

 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
 #endif
 	OPENBLAS_COMPLEX_FLOAT zdot;
       CREAL(zdot) = 0.0;
       CIMAG(zdot) = 0.0;

 #if defined(SMP)
 	nthreads = num_cpu_avail(1);

 	if (inc_x == 0 || inc_y == 0)
 		nthreads = 1;

 	if (n <= 10000)
 		nthreads = 1;

 	if (nthreads == 1) {
 		zdot_compute(n, x, inc_x, y, inc_y, &zdot);
 	} else {
 		int mode, i;
 		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
 		OPENBLAS_COMPLEX_FLOAT *ptr;

 #if !defined(DOUBLE)
 		mode = BLAS_SINGLE  | BLAS_COMPLEX;
 #else
 		mode = BLAS_DOUBLE  | BLAS_COMPLEX;
 #endif

 		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
 				   x, inc_x, y, inc_y, result, 0,
 				   ( void *)zdot_thread_function, nthreads);

 		ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
 		for (i = 0; i < nthreads; i++) {
 			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
 			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
 			ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
 		}
 	}
 #else
 	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
 #endif

 	return zdot;
 }
--- a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
+++ b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
--- a/param.h
+++ b/param.h
@@ -2447,17 +2447,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P	sgemm_p
 #define DGEMM_DEFAULT_P	dgemm_p
 #define CGEMM_DEFAULT_P cgemm_p
 #define ZGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P zgemm_p

 #define SGEMM_DEFAULT_Q sgemm_q
 #define DGEMM_DEFAULT_Q dgemm_q
 #define CGEMM_DEFAULT_Q cgemm_q
 #define ZGEMM_DEFAULT_Q 512
 #define ZGEMM_DEFAULT_Q zgemm_q

 #define SGEMM_DEFAULT_R sgemm_r
 #define DGEMM_DEFAULT_R dgemm_r
 #define CGEMM_DEFAULT_R cgemm_r
 #define ZGEMM_DEFAULT_R 2048
 #define ZGEMM_DEFAULT_R zgemm_r

 #define SYMV_P	16
 #endif