added nrm2 kernel for all precisions

12 years ago · f27cabfd08
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -55,10 +55,10 @@ DDOTKERNEL   = ddot_vfpv3.S
 CDOTKERNEL   = cdot_vfpv3.S
 ZDOTKERNEL   = zdot_vfpv3.S
 SNRM2KERNEL  = ../arm/nrm2.c
 DNRM2KERNEL  = ../arm/nrm2.c
 CNRM2KERNEL  = ../arm/znrm2.c
 ZNRM2KERNEL  = ../arm/znrm2.c
 SNRM2KERNEL  = nrm2_vfpv3.S
 DNRM2KERNEL  = nrm2_vfpv3.S
 CNRM2KERNEL  = nrm2_vfpv3.S
 ZNRM2KERNEL  = nrm2_vfpv3.S
 SROTKERNEL   = rot_vfpv3.S
 DROTKERNEL   = rot_vfpv3.S
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@@ -0,0 +1,508 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /**************************************************************************************
 * 2013/11/16 Saar
 * 	 BLASTEST 		: OK
 * 	 CTEST			: OK
 * 	 TEST			: OK
 *
 **************************************************************************************/
 #define ASSEMBLER
 #include "common.h"
 #define STACKSIZE 256
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2
 #define I	r12
 #define X_PRE	512
 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/
 #if	!defined(COMPLEX)
 #if	defined(DOUBLE)
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT:
 	add	X, X, INC_X
 .endm
 #else
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 .endm
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT:
 	add	X, X, INC_X
 .endm
 #endif
 #else
 #if	defined(DOUBLE)
 .macro KERNEL_F1
 	fldmiad	X!, 	{ d4 - d5 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_F1_NEXT_\@:
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 .endm
 .macro KERNEL_S1
 	fldmiad	X, 	{ d4 - d5 }
 	vcmpe.f64	d4, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f64   	d4,  d4 
 	vcmpe.f64  	d0,  d4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d4, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f64	d2 , d0, d4			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d4				// scale = x
 KERNEL_S1_NEXT_\@:
 	vcmpe.f64	d5, d6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f64   	d5,  d5 
 	vcmpe.f64  	d0,  d5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f64	d2 , d5, d0			// scale >= x ?	x / scale
 	vmlage.f64	d1 , d2 , d2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f64	d2 , d0, d5			// scale / x
 	vmul.f64	d2 , d2, d2			// ( scale / x ) * ( scale / x )
 	vmul.f64	d3 , d1, d2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f64	d1 , d3, d7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f64	d0 , d5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #else
 .macro KERNEL_F1
 	fldmias	X!, 	{ s4 - s5 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_F1_NEXT_\@:
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_F1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_F1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_F1_END_\@:
 .endm
 .macro KERNEL_F8
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	pld	[ X, #X_PRE ]
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 	KERNEL_F1
 .endm
 .macro KERNEL_S1
 	fldmias	X, 	{ s4 - s5 }
 	vcmpe.f32	s4, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_NEXT_\@	
 	vabs.f32   	s4,  s4 
 	vcmpe.f32  	s0,  s4				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s4, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_NEXT_\@
 	vdiv.f32	s2 , s0, s4			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s4				// scale = x
 KERNEL_S1_NEXT_\@:
 	vcmpe.f32	s5, s6				// compare with 0.0
 	vmrs		APSR_nzcv, fpscr
 	beq		KERNEL_S1_END_\@	
 	vabs.f32   	s5,  s5 
 	vcmpe.f32  	s0,  s5				// compare with scale
 	vmrs		APSR_nzcv, fpscr
 	vdivge.f32	s2 , s5, s0			// scale >= x ?	x / scale
 	vmlage.f32	s1 , s2 , s2			// ssq += ( x/scale ) * ( x/scale )
 	bge		KERNEL_S1_END_\@
 	vdiv.f32	s2 , s0, s5			// scale / x
 	vmul.f32	s2 , s2, s2			// ( scale / x ) * ( scale / x )
 	vmul.f32	s3 , s1, s2			// ssq * ( scale / x ) * ( scale / x )
 	vadd.f32	s1 , s3, s7			// ssq = 1 + ssq * ( scale / x ) * ( scale / x )
 	vmov.f32	s0 , s5				// scale = x
 KERNEL_S1_END_\@:
 	add	X, X, INC_X
 .endm
 #endif
 #endif
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
 	PROLOGUE
 	.align 5
 #if defined(DOUBLE)
 	vsub.f64                d0 , d0 , d0		// scale=0.0
 	vmov.f64		d1 , #1.0		// ssq=1.0
 	vmov.f64		d7 , d1			// value 1.0 
 	vmov.f64		d6 , d0			// value 0.0 
 #else
 	vsub.f32                s0 , s0 , s0		// scale=0.0
 	vmov.f32		s1 , #1.0		// ssq=1.0
 	vmov.f32		s7 , s1			// value 1.0
 	vmov.f32		s6 , s0			// value 0.0 
 #endif
 	cmp	N, #0
 	ble	nrm2_kernel_L999
 	cmp	INC_X, #0
 	beq	nrm2_kernel_L999
 	cmp	INC_X, #1
 	bne	nrm2_kernel_S_BEGIN
 nrm2_kernel_F_BEGIN:
 	asrs	I, N, #3				// I = N / 8
 	ble	nrm2_kernel_F1
 nrm2_kernel_F8:
 	KERNEL_F8
 	subs    I, I, #1
        bne     nrm2_kernel_F8
 nrm2_kernel_F1:
 	ands    I, N, #7
        ble     nrm2_kernel_L999
 nrm2_kernel_F10:
 	KERNEL_F1
 	subs    I, I, #1
        bne     nrm2_kernel_F10
 	b	nrm2_kernel_L999
 nrm2_kernel_S_BEGIN:
 #if defined(COMPLEX)
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif
 #else
 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif
 #endif
 nrm2_kernel_S1:
 	mov	I, N
 	.align 5
 nrm2_kernel_S10:
 	KERNEL_S1
 	subs    I, I, #1
        bne     nrm2_kernel_S10
 nrm2_kernel_L999:
 #if defined(DOUBLE)
 	vsqrt.f64	d1, d1
 	vmul.f64	d0, d0, d1
 #else
 	vsqrt.f32	s1, s1
 	vmul.f32	s0, s0, s1
 #endif
 	bx	lr
 	EPILOGUE