| @@ -72,13 +72,13 @@ ZSWAPKERNEL = swap_vfp.S | |||
| SGEMVNKERNEL = gemv_n_vfp.S | |||
| DGEMVNKERNEL = gemv_n_vfp.S | |||
| CGEMVNKERNEL = zgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||
| SGEMVTKERNEL = gemv_t_vfp.S | |||
| DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = zgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||
| @@ -70,15 +70,15 @@ DSCALKERNEL = scal_vfp.S | |||
| CSCALKERNEL = scal_vfp.S | |||
| ZSCALKERNEL = scal_vfp.S | |||
| SGEMVNKERNEL = gemv_n_vfpv3.S | |||
| DGEMVNKERNEL = gemv_n_vfpv3.S | |||
| CGEMVNKERNEL = zgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| SGEMVTKERNEL = gemv_t_vfpv3.S | |||
| DGEMVTKERNEL = gemv_t_vfpv3.S | |||
| CGEMVTKERNEL = zgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| SGEMVNKERNEL = gemv_n_vfp.S | |||
| DGEMVNKERNEL = gemv_n_vfp.S | |||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||
| SGEMVTKERNEL = gemv_t_vfp.S | |||
| DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||
| @@ -0,0 +1,697 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2013/11/29 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 /* bytes of local frame reserved below the pushed registers */ | |||
| #define OLD_LDA [fp, #0 ] /* stack argument: lda in elements; scaled to bytes once loaded into LDA */ | |||
| #define X [fp, #4 ] /* stack argument: pointer to x, reloaded into XO for every row block */ | |||
| #define OLD_INC_X [fp, #8 ] /* stack argument: incx */ | |||
| #define Y [fp, #12 ] /* stack argument: pointer to y */ | |||
| #define OLD_INC_Y [fp, #16 ] /* stack argument: incy */ | |||
| #define OLD_A r3 /* incoming pointer to A; r3 is used as scratch after it is spilled to A */ | |||
| #define OLD_M r0 /* incoming m; spilled to M because r0 is reused as AO1 */ | |||
| #define AO1 r0 /* walking pointer into A, advanced by LDA per x element */ | |||
| #define N r1 /* n: number of x elements consumed per output row */ | |||
| #define J r2 /* inner loop counter */ | |||
| #define AO2 r4 /* second pointer into A; only used as a pld address in this file */ | |||
| #define XO r5 /* walking pointer into x */ | |||
| #define YO r6 /* walking pointer into y */ | |||
| #define LDA r7 /* lda converted to bytes */ | |||
| #define INC_X r8 /* incx; converted to bytes on the strided path */ | |||
| #define INC_Y r9 /* incy; converted to bytes on the strided path */ | |||
| #define I r12 /* outer loop counter over row blocks */ | |||
| #define ALPHA_I [fp, #-236] /* local slot: alpha component stored from s1 */ | |||
| #define ALPHA_R [fp, #-244] /* local slot: alpha component stored from s0 */ | |||
| #define M [fp, #-252 ] /* local slot: saved m */ | |||
| #define A [fp, #-256 ] /* local slot: A pointer for the next row block */ | |||
| #define X_PRE 64 /* pld byte offset applied to XO */ | |||
| #define Y_PRE 0 /* pld byte offset applied to YO */ | |||
| #define A_PRE 0 /* pld byte offset applied to AO2 */ | |||
| /**************************************************************************************/ | |||
| // KMAC_R / KMAC_I fold the product of the second components of A and x | |||
| // into the accumulators; their signs flip when exactly one of CONJ and | |||
| // XCONJ is defined. | |||
| #if defined(CONJ) == defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_I fmacs | |||
| #else | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #endif | |||
| // FMAC_* apply alpha to the accumulators when updating y; only XCONJ | |||
| // changes them, and FMAC_R1 / FMAC_I2 are the same in every variant. | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #if defined(XCONJ) | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #else | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_I1 fmacs | |||
| #endif | |||
| .macro INIT_F4 | |||
| // Prefetch y and clear the eight accumulators s8-s15. | |||
| // The zero comes from a core register: "vsub s8, s8, s8" would give NaN | |||
| // whenever s8 already held NaN or Inf and poison every sum. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     pld [ YO, #Y_PRE ] | |||
|     mov r3 , #0 | |||
|     vmov s8 , r3 | |||
|     vmov.f32 s9 , s8 | |||
|     vmov.f32 s10, s8 | |||
|     vmov.f32 s11, s8 | |||
|     vmov.f32 s12, s8 | |||
|     vmov.f32 s13, s8 | |||
|     vmov.f32 s14, s8 | |||
|     vmov.f32 s15, s8 | |||
| .endm | |||
| .macro KERNEL_F4X4 | |||
| // Prefetch x once, then run the single-element kernel four times. | |||
|     pld [ XO, #X_PRE ] | |||
|     .rept 4 | |||
|     KERNEL_F4X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_F4X1 | |||
| // Multiply one x element (s4, s5) with the four consecutive elements of A | |||
| // at AO1 and accumulate into s8-s15, then step XO by 8 bytes and AO1/AO2 | |||
| // by LDA. | |||
|     pld [ AO2, #A_PRE ] | |||
|     vldr s4 , [ XO ] | |||
|     vldr s5 , [ XO, #4 ] | |||
|     vldr s0 , [ AO1 ] | |||
|     vldr s1 , [ AO1, #4 ] | |||
|     vldr s2 , [ AO1, #8 ] | |||
|     vldr s3 , [ AO1, #12 ] | |||
|     vmla.f32 s8 , s0, s4 | |||
|     vmla.f32 s9 , s0, s5 | |||
|     vmla.f32 s10 , s2, s4 | |||
|     vmla.f32 s11 , s2, s5 | |||
|     KMAC_R s8 , s1, s5 | |||
|     KMAC_I s9 , s1, s4 | |||
|     KMAC_R s10 , s3, s5 | |||
|     KMAC_I s11 , s3, s4 | |||
|     vldr s0 , [ AO1, #16 ] | |||
|     vldr s1 , [ AO1, #20 ] | |||
|     vldr s2 , [ AO1, #24 ] | |||
|     vldr s3 , [ AO1, #28 ] | |||
|     vmla.f32 s12 , s0, s4 | |||
|     vmla.f32 s13 , s0, s5 | |||
|     vmla.f32 s14 , s2, s4 | |||
|     vmla.f32 s15 , s2, s5 | |||
|     KMAC_R s12 , s1, s5 | |||
|     KMAC_I s13 , s1, s4 | |||
|     KMAC_R s14 , s3, s5 | |||
|     KMAC_I s15 , s3, s4 | |||
|     add XO , XO, #8 | |||
|     add AO1 , AO1, LDA | |||
|     add AO2 , AO2, LDA | |||
| .endm | |||
| .macro SAVE_F4 | |||
| // Scale the accumulators s8-s15 by alpha and add them to four contiguous | |||
| // y elements, two per load/store; YO advances by 32 bytes in total. | |||
|     vldr s0, ALPHA_R | |||
|     vldr s1, ALPHA_I | |||
|     vldmia YO, { s4 - s7 } | |||
|     FMAC_R1 s4 , s0 , s8 | |||
|     FMAC_I1 s5 , s0 , s9 | |||
|     FMAC_R2 s4 , s1 , s9 | |||
|     FMAC_I2 s5 , s1 , s8 | |||
|     FMAC_R1 s6 , s0 , s10 | |||
|     FMAC_I1 s7 , s0 , s11 | |||
|     FMAC_R2 s6 , s1 , s11 | |||
|     FMAC_I2 s7 , s1 , s10 | |||
|     vstmia YO!, { s4 - s7 } | |||
|     vldmia YO, { s4 - s7 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     FMAC_R1 s6 , s0 , s14 | |||
|     FMAC_I1 s7 , s0 , s15 | |||
|     FMAC_R2 s6 , s1 , s15 | |||
|     FMAC_I2 s7 , s1 , s14 | |||
|     vstmia YO!, { s4 - s7 } | |||
| .endm | |||
| .macro INIT_F1 | |||
| // Clear the accumulator pair s8/s9. The zero comes from a core register | |||
| // because "vsub s8, s8, s8" gives NaN when s8 already holds NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s8 , r3 | |||
|     vmov.f32 s9 , s8 | |||
| .endm | |||
| .macro KERNEL_F1X1 | |||
| // Accumulate one element of A (at AO1) times one element of x (at XO) | |||
| // into s8/s9, then step XO by 8 bytes and AO1 by LDA. | |||
|     vldr s4 , [ XO ] | |||
|     vldr s5 , [ XO, #4 ] | |||
|     vldr s0 , [ AO1 ] | |||
|     vldr s1 , [ AO1, #4 ] | |||
|     vmla.f32 s8 , s0, s4 | |||
|     vmla.f32 s9 , s0, s5 | |||
|     KMAC_R s8 , s1, s5 | |||
|     KMAC_I s9 , s1, s4 | |||
|     add XO , XO, #8 | |||
|     add AO1 , AO1, LDA | |||
| .endm | |||
| .macro SAVE_F1 | |||
| // Add alpha times the accumulator pair s8/s9 to one y element; the | |||
| // store writes back so YO ends 8 bytes further on. | |||
|     vldr s0, ALPHA_R | |||
|     vldr s1, ALPHA_I | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s8 | |||
|     FMAC_I1 s5 , s0 , s9 | |||
|     FMAC_R2 s4 , s1 , s9 | |||
|     FMAC_I2 s5 , s1 , s8 | |||
|     vstmia YO!, { s4 - s5 } | |||
| .endm | |||
| /****************************************************************************************/ | |||
| .macro INIT_S4 | |||
| // Clear the eight accumulators s8-s15 for the strided path. | |||
| // The zero comes from a core register: "vsub s8, s8, s8" would give NaN | |||
| // whenever s8 already held NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s8 , r3 | |||
|     vmov.f32 s9 , s8 | |||
|     vmov.f32 s10, s8 | |||
|     vmov.f32 s11, s8 | |||
|     vmov.f32 s12, s8 | |||
|     vmov.f32 s13, s8 | |||
|     vmov.f32 s14, s8 | |||
|     vmov.f32 s15, s8 | |||
| .endm | |||
| .macro KERNEL_S4X4 | |||
| // Four back-to-back invocations of the strided single-element kernel. | |||
|     .rept 4 | |||
|     KERNEL_S4X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_S4X1 | |||
| // Strided-x variant: multiply one x element (s4, s5) with the four | |||
| // consecutive elements of A at AO1, accumulate into s8-s15, then step | |||
| // XO by INC_X and AO1/AO2 by LDA. | |||
|     vldr s4 , [ XO ] | |||
|     vldr s5 , [ XO, #4 ] | |||
|     vldr s0 , [ AO1 ] | |||
|     vldr s1 , [ AO1, #4 ] | |||
|     vldr s2 , [ AO1, #8 ] | |||
|     vldr s3 , [ AO1, #12 ] | |||
|     vmla.f32 s8 , s0, s4 | |||
|     vmla.f32 s9 , s0, s5 | |||
|     vmla.f32 s10 , s2, s4 | |||
|     vmla.f32 s11 , s2, s5 | |||
|     KMAC_R s8 , s1, s5 | |||
|     KMAC_I s9 , s1, s4 | |||
|     KMAC_R s10 , s3, s5 | |||
|     KMAC_I s11 , s3, s4 | |||
|     vldr s0 , [ AO1, #16 ] | |||
|     vldr s1 , [ AO1, #20 ] | |||
|     vldr s2 , [ AO1, #24 ] | |||
|     vldr s3 , [ AO1, #28 ] | |||
|     vmla.f32 s12 , s0, s4 | |||
|     vmla.f32 s13 , s0, s5 | |||
|     vmla.f32 s14 , s2, s4 | |||
|     vmla.f32 s15 , s2, s5 | |||
|     KMAC_R s12 , s1, s5 | |||
|     KMAC_I s13 , s1, s4 | |||
|     KMAC_R s14 , s3, s5 | |||
|     KMAC_I s15 , s3, s4 | |||
|     add XO , XO, INC_X | |||
|     add AO1 , AO1, LDA | |||
|     add AO2 , AO2, LDA | |||
| .endm | |||
| .macro SAVE_S4 | |||
| // Add alpha times each accumulator pair to four y elements spaced INC_Y | |||
| // bytes apart; YO moves on by INC_Y after every element. | |||
|     vldr s0, ALPHA_R | |||
|     vldr s1, ALPHA_I | |||
|     // element 0 <- s8 / s9 | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s8 | |||
|     FMAC_I1 s5 , s0 , s9 | |||
|     FMAC_R2 s4 , s1 , s9 | |||
|     FMAC_I2 s5 , s1 , s8 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, INC_Y | |||
|     // element 1 <- s10 / s11 | |||
|     vldmia YO, { s6 - s7 } | |||
|     FMAC_R1 s6 , s0 , s10 | |||
|     FMAC_I1 s7 , s0 , s11 | |||
|     FMAC_R2 s6 , s1 , s11 | |||
|     FMAC_I2 s7 , s1 , s10 | |||
|     vstmia YO, { s6 - s7 } | |||
|     add YO, YO, INC_Y | |||
|     // element 2 <- s12 / s13 | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, INC_Y | |||
|     // element 3 <- s14 / s15 | |||
|     vldmia YO, { s6 - s7 } | |||
|     FMAC_R1 s6 , s0 , s14 | |||
|     FMAC_I1 s7 , s0 , s15 | |||
|     FMAC_R2 s6 , s1 , s15 | |||
|     FMAC_I2 s7 , s1 , s14 | |||
|     vstmia YO, { s6 - s7 } | |||
|     add YO, YO, INC_Y | |||
| .endm | |||
| .macro INIT_S1 | |||
| // Clear the accumulator pair s8/s9 for the strided path. The zero comes | |||
| // from a core register because "vsub s8, s8, s8" gives NaN when s8 | |||
| // already holds NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s8 , r3 | |||
|     vmov.f32 s9 , s8 | |||
| .endm | |||
| .macro KERNEL_S1X1 | |||
| // Strided-x variant: accumulate one element of A (at AO1) times one | |||
| // element of x (at XO) into s8/s9, then step XO by INC_X and AO1 by LDA. | |||
|     vldr s4 , [ XO ] | |||
|     vldr s5 , [ XO, #4 ] | |||
|     vldr s0 , [ AO1 ] | |||
|     vldr s1 , [ AO1, #4 ] | |||
|     vmla.f32 s8 , s0, s4 | |||
|     vmla.f32 s9 , s0, s5 | |||
|     KMAC_R s8 , s1, s5 | |||
|     KMAC_I s9 , s1, s4 | |||
|     add XO , XO, INC_X | |||
|     add AO1 , AO1, LDA | |||
| .endm | |||
| .macro SAVE_S1 | |||
| // Add alpha times the accumulator pair s8/s9 to the y element at YO, | |||
| // then move YO on by INC_Y bytes. | |||
|     vldr s0, ALPHA_R | |||
|     vldr s1, ALPHA_I | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s8 | |||
|     FMAC_I1 s5 , s0 , s9 | |||
|     FMAC_R2 s4 , s1 , s9 | |||
|     FMAC_I2 s5 , s1 , s8 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| // Non-transposed single-precision complex GEMV kernel: | |||
| //   y[i] += alpha * sum_j A[i + j*lda] * x[j] | |||
| // with the sign variant selected by CONJ / XCONJ at build time. | |||
| // In:  r0 = m, r1 = n, r3 = A, s0/s1 = alpha (taken from VFP registers), | |||
| //      stack = lda, x, incx, y, incy.   Out: r0 = 0. | |||
| // Rows are handled four at a time, then one at a time; the F* labels are | |||
| // the incx == incy == 1 path and the S* labels the strided path. | |||
| // Results of ands/asrs are tested with beq: those instructions do not | |||
| // write the V flag, so ble would depend on a stale V left by an earlier | |||
| // compare. All tested counts are known to be non-negative here. | |||
|     .align 5 | |||
|     push {r4 - r9 , fp} | |||
|     add fp, sp, #28 // fp = sp at entry: stack arguments start at [fp, #0] | |||
|     sub sp, sp, #STACKSIZE // reserve stack | |||
|     sub r12, fp, #192 | |||
| #if defined(DOUBLE) | |||
|     vstm r12, { d8 - d15 } // store floating point registers | |||
| #else | |||
|     vstm r12, { s8 - s15 } // store floating point registers | |||
| #endif | |||
|     cmp OLD_M, #0 | |||
|     ble cgemvn_kernel_L999 // nothing to do for m <= 0 | |||
|     cmp N, #0 | |||
|     ble cgemvn_kernel_L999 // nothing to do for n <= 0 | |||
|     str OLD_A, A // spill: r3 and r0 are reused below | |||
|     str OLD_M, M | |||
|     vstr s0 , ALPHA_R | |||
|     vstr s1 , ALPHA_I | |||
|     ldr INC_X , OLD_INC_X | |||
|     ldr INC_Y , OLD_INC_Y | |||
|     cmp INC_X, #0 | |||
|     beq cgemvn_kernel_L999 // a zero increment is rejected | |||
|     cmp INC_Y, #0 | |||
|     beq cgemvn_kernel_L999 | |||
|     ldr LDA, OLD_LDA | |||
| #if defined(DOUBLE) | |||
|     lsl LDA, LDA, #4 // LDA * SIZE * 2 | |||
| #else | |||
|     lsl LDA, LDA, #3 // LDA * SIZE * 2 | |||
| #endif | |||
|     cmp INC_X, #1 | |||
|     bne cgemvn_kernel_S4_BEGIN | |||
|     cmp INC_Y, #1 | |||
|     bne cgemvn_kernel_S4_BEGIN | |||
| cgemvn_kernel_F4_BEGIN: | |||
|     ldr YO , Y | |||
|     ldr I, M | |||
|     asrs I, I, #2 // I = M / 4 | |||
|     beq cgemvn_kernel_F1_BEGIN | |||
| cgemvn_kernel_F4X4: | |||
|     ldr AO1, A | |||
|     add AO2, AO1, LDA | |||
|     add r3 , AO1, #32 // next block starts four complex rows further down | |||
|     str r3 , A | |||
|     add AO2, AO2, LDA | |||
|     add AO2, AO2, LDA // AO2 = AO1 + 3 * LDA, used for prefetch only | |||
|     ldr XO , X | |||
|     INIT_F4 | |||
|     asrs J, N, #2 // J = N / 4 | |||
|     beq cgemvn_kernel_F4X1 | |||
| cgemvn_kernel_F4X4_10: | |||
|     KERNEL_F4X4 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_F4X4_10 | |||
| cgemvn_kernel_F4X1: | |||
|     ands J, N , #3 // J = N % 4 | |||
|     beq cgemvn_kernel_F4_END | |||
| cgemvn_kernel_F4X1_10: | |||
|     KERNEL_F4X1 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_F4X1_10 | |||
| cgemvn_kernel_F4_END: | |||
|     SAVE_F4 | |||
|     subs I , I , #1 | |||
|     bne cgemvn_kernel_F4X4 | |||
| cgemvn_kernel_F1_BEGIN: | |||
|     ldr I, M | |||
|     ands I, I , #3 // I = M % 4 | |||
|     beq cgemvn_kernel_L999 | |||
| cgemvn_kernel_F1X1: | |||
|     ldr AO1, A | |||
|     add r3, AO1, #8 // next row starts one complex element further down | |||
|     str r3, A | |||
|     ldr XO , X | |||
|     INIT_F1 | |||
|     mov J, N | |||
| cgemvn_kernel_F1X1_10: | |||
|     KERNEL_F1X1 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_F1X1_10 | |||
| cgemvn_kernel_F1_END: | |||
|     SAVE_F1 | |||
|     subs I , I , #1 | |||
|     bne cgemvn_kernel_F1X1 | |||
|     b cgemvn_kernel_L999 | |||
| /*************************************************************************************************************/ | |||
| cgemvn_kernel_S4_BEGIN: | |||
| #if defined(DOUBLE) | |||
|     lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||
|     lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||
| #else | |||
|     lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||
|     lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||
| #endif | |||
|     ldr YO , Y | |||
|     ldr I, M | |||
|     asrs I, I, #2 // I = M / 4 | |||
|     beq cgemvn_kernel_S1_BEGIN | |||
| cgemvn_kernel_S4X4: | |||
|     ldr AO1, A | |||
|     add AO2, AO1, LDA | |||
|     add r3 , AO1, #32 | |||
|     str r3 , A | |||
|     ldr XO , X | |||
|     INIT_S4 | |||
|     asrs J, N, #2 // J = N / 4 | |||
|     beq cgemvn_kernel_S4X1 | |||
| cgemvn_kernel_S4X4_10: | |||
|     KERNEL_S4X4 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_S4X4_10 | |||
| cgemvn_kernel_S4X1: | |||
|     ands J, N , #3 // J = N % 4 | |||
|     beq cgemvn_kernel_S4_END | |||
| cgemvn_kernel_S4X1_10: | |||
|     KERNEL_S4X1 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_S4X1_10 | |||
| cgemvn_kernel_S4_END: | |||
|     SAVE_S4 | |||
|     subs I , I , #1 | |||
|     bne cgemvn_kernel_S4X4 | |||
| cgemvn_kernel_S1_BEGIN: | |||
|     ldr I, M | |||
|     ands I, I , #3 // I = M % 4 | |||
|     beq cgemvn_kernel_L999 | |||
| cgemvn_kernel_S1X1: | |||
|     ldr AO1, A | |||
|     add r3, AO1, #8 | |||
|     str r3, A | |||
|     ldr XO , X | |||
|     INIT_S1 | |||
|     mov J, N | |||
| cgemvn_kernel_S1X1_10: | |||
|     KERNEL_S1X1 | |||
|     subs J, J, #1 | |||
|     bne cgemvn_kernel_S1X1_10 | |||
| cgemvn_kernel_S1_END: | |||
|     SAVE_S1 | |||
|     subs I , I , #1 | |||
|     bne cgemvn_kernel_S1X1 | |||
| /*************************************************************************************************************/ | |||
| cgemvn_kernel_L999: | |||
|     sub r3, fp, #192 | |||
| #if defined(DOUBLE) | |||
|     vldm r3, { d8 - d15 } // restore floating point registers | |||
| #else | |||
|     vldm r3, { s8 - s15 } // restore floating point registers | |||
| #endif | |||
|     mov r0, #0 // set return value | |||
|     sub sp, fp, #28 // drop the local frame; sp points at the pushed registers | |||
|     pop {r4 -r9 ,fp} | |||
|     bx lr | |||
| EPILOGUE | |||
| @@ -0,0 +1,607 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2013/11/29 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 /* bytes of local frame reserved below the pushed registers */ | |||
| #define OLD_LDA [fp, #0 ] /* stack argument: lda in elements; scaled to bytes once loaded into LDA */ | |||
| #define X [fp, #4 ] /* stack argument: pointer to x, reloaded into XO for every column block */ | |||
| #define OLD_INC_X [fp, #8 ] /* stack argument: incx */ | |||
| #define Y [fp, #12 ] /* stack argument: pointer to y */ | |||
| #define OLD_INC_Y [fp, #16 ] /* stack argument: incy */ | |||
| #define OLD_A r3 /* incoming pointer to A; r3 is used as scratch after it is spilled to A */ | |||
| #define OLD_N r1 /* incoming n; spilled to N because r1 is reused as AO1 */ | |||
| #define M r0 /* m: number of x elements consumed per output element */ | |||
| #define AO1 r1 /* walking pointer down the first column of the current block */ | |||
| #define J r2 /* outer loop counter over column blocks */ | |||
| #define AO2 r4 /* walking pointer down the second column (AO1 + LDA at block start) */ | |||
| #define XO r5 /* walking pointer into x */ | |||
| #define YO r6 /* walking pointer into y */ | |||
| #define LDA r7 /* lda converted to bytes */ | |||
| #define INC_X r8 /* incx; converted to bytes on the strided path */ | |||
| #define INC_Y r9 /* incy; converted to bytes on the strided path */ | |||
| #define I r12 /* inner loop counter */ | |||
| #define N [fp, #-252 ] /* local slot: saved n */ | |||
| #define A [fp, #-256 ] /* local slot: A pointer for the next column block */ | |||
| #define X_PRE 512 /* prefetch distance; not referenced by any instruction in this file */ | |||
| #define A_PRE 512 /* prefetch distance; not referenced by any instruction in this file */ | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // KMAC_R / KMAC_I fold the product of the second components of A and x | |||
| // into the accumulators; their signs flip when exactly one of CONJ and | |||
| // XCONJ is defined. | |||
| #if defined(CONJ) == defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_I fmacs | |||
| #else | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #endif | |||
| // FMAC_* apply alpha to the accumulators when updating y; only XCONJ | |||
| // changes them, and FMAC_R1 / FMAC_I2 are the same in every variant. | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #if defined(XCONJ) | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #else | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_I1 fmacs | |||
| #endif | |||
| .macro INIT_F2 | |||
| // Clear the two accumulator pairs s12/s13 and s14/s15. | |||
| // The zero comes from a core register: "vsub sN, sN, sN" gives NaN | |||
| // whenever sN already holds NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s12, r3 | |||
|     vmov s13, r3 | |||
|     vmov s14, r3 | |||
|     vmov s15, r3 | |||
| .endm | |||
| .macro KERNEL_F2X4 | |||
| // Four back-to-back invocations of the two-column single-element kernel. | |||
|     .rept 4 | |||
|     KERNEL_F2X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_F2X1 | |||
| // Load one x element and one element from each of the two columns, with | |||
| // all three pointers post-incremented by 8 bytes, and accumulate the | |||
| // products into s12/s13 (column AO1) and s14/s15 (column AO2). | |||
|     vldmia XO! , { s2 - s3 } | |||
|     vldmia AO1!, { s4 - s5 } | |||
|     vldmia AO2!, { s8 - s9 } | |||
|     vmla.f32 s12 , s4 , s2 | |||
|     vmla.f32 s13 , s4 , s3 | |||
|     KMAC_R s12 , s5 , s3 | |||
|     KMAC_I s13 , s5 , s2 | |||
|     vmla.f32 s14 , s8 , s2 | |||
|     vmla.f32 s15 , s8 , s3 | |||
|     KMAC_R s14 , s9 , s3 | |||
|     KMAC_I s15 , s9 , s2 | |||
| .endm | |||
| .macro SAVE_F2 | |||
| // Add alpha (still held in s0/s1) times both accumulator pairs to two | |||
| // contiguous y elements; the store writes back so YO ends 16 bytes on. | |||
|     vldmia YO, { s4 - s7 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     FMAC_R1 s6 , s0 , s14 | |||
|     FMAC_I1 s7 , s0 , s15 | |||
|     FMAC_R2 s6 , s1 , s15 | |||
|     FMAC_I2 s7 , s1 , s14 | |||
|     vstmia YO!, { s4 - s7 } | |||
| .endm | |||
| /************************************************************************************************/ | |||
| .macro INIT_F1 | |||
| // Clear the accumulator pair s12/s13. The zero comes from a core | |||
| // register because "vsub sN, sN, sN" gives NaN when sN already holds | |||
| // NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s12, r3 | |||
|     vmov s13, r3 | |||
| .endm | |||
| .macro KERNEL_F1X4 | |||
| // Four back-to-back invocations of the one-column single-element kernel. | |||
|     .rept 4 | |||
|     KERNEL_F1X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_F1X1 | |||
| // Load one x element and one column element, both pointers | |||
| // post-incremented by 8 bytes, and accumulate the product into s12/s13. | |||
|     vldmia XO! , { s2 - s3 } | |||
|     vldmia AO1!, { s4 - s5 } | |||
|     vmla.f32 s12 , s4 , s2 | |||
|     vmla.f32 s13 , s4 , s3 | |||
|     KMAC_R s12 , s5 , s3 | |||
|     KMAC_I s13 , s5 , s2 | |||
| .endm | |||
| .macro SAVE_F1 | |||
| // Add alpha (held in s0/s1) times the accumulator pair s12/s13 to one | |||
| // y element, then advance YO by 8 bytes. | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, #8 | |||
| .endm | |||
| /************************************************************************************************/ | |||
| .macro INIT_S2 | |||
| // Clear the two accumulator pairs s12/s13 and s14/s15 for the strided | |||
| // path. The zero comes from a core register: "vsub sN, sN, sN" gives | |||
| // NaN whenever sN already holds NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s12, r3 | |||
|     vmov s13, r3 | |||
|     vmov s14, r3 | |||
|     vmov s15, r3 | |||
| .endm | |||
| .macro KERNEL_S2X4 | |||
| // Four back-to-back invocations of the strided two-column kernel. | |||
|     .rept 4 | |||
|     KERNEL_S2X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_S2X1 | |||
| // Strided-x variant: load one x element (XO then steps by INC_X) and one | |||
| // element from each column (post-incremented by 8 bytes), accumulating | |||
| // into s12/s13 (column AO1) and s14/s15 (column AO2). | |||
|     vldmia XO , { s2 - s3 } | |||
|     add XO, XO, INC_X | |||
|     vldmia AO1!, { s4 - s5 } | |||
|     vldmia AO2!, { s8 - s9 } | |||
|     vmla.f32 s12 , s4 , s2 | |||
|     vmla.f32 s13 , s4 , s3 | |||
|     KMAC_R s12 , s5 , s3 | |||
|     KMAC_I s13 , s5 , s2 | |||
|     vmla.f32 s14 , s8 , s2 | |||
|     vmla.f32 s15 , s8 , s3 | |||
|     KMAC_R s14 , s9 , s3 | |||
|     KMAC_I s15 , s9 , s2 | |||
| .endm | |||
| .macro SAVE_S2 | |||
| // Add alpha (held in s0/s1) times each accumulator pair to two y | |||
| // elements spaced INC_Y bytes apart; YO moves on after every element. | |||
|     // element 0 <- s12 / s13 | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, INC_Y | |||
|     // element 1 <- s14 / s15 | |||
|     vldmia YO, { s6 - s7 } | |||
|     FMAC_R1 s6 , s0 , s14 | |||
|     FMAC_I1 s7 , s0 , s15 | |||
|     FMAC_R2 s6 , s1 , s15 | |||
|     FMAC_I2 s7 , s1 , s14 | |||
|     vstmia YO, { s6 - s7 } | |||
|     add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************************/ | |||
| .macro INIT_S1 | |||
| // Clear the accumulator pair s12/s13 for the strided path. The zero | |||
| // comes from a core register because "vsub sN, sN, sN" gives NaN when | |||
| // sN already holds NaN or Inf. | |||
| // Clobbers r3, which is scratch at every call site in this file. | |||
|     mov r3 , #0 | |||
|     vmov s12, r3 | |||
|     vmov s13, r3 | |||
| .endm | |||
| .macro KERNEL_S1X4 | |||
| // Four back-to-back invocations of the strided one-column kernel. | |||
|     .rept 4 | |||
|     KERNEL_S1X1 | |||
|     .endr | |||
| .endm | |||
| .macro KERNEL_S1X1 | |||
| // Strided-x variant: load one x element (XO then steps by INC_X) and one | |||
| // column element (AO1 post-incremented by 8 bytes), accumulating the | |||
| // product into s12/s13. | |||
|     vldmia XO , { s2 - s3 } | |||
|     add XO, XO, INC_X | |||
|     vldmia AO1!, { s4 - s5 } | |||
|     vmla.f32 s12 , s4 , s2 | |||
|     vmla.f32 s13 , s4 , s3 | |||
|     KMAC_R s12 , s5 , s3 | |||
|     KMAC_I s13 , s5 , s2 | |||
| .endm | |||
| .macro SAVE_S1 | |||
| // Add alpha (held in s0/s1) times the accumulator pair s12/s13 to the | |||
| // y element at YO, then move YO on by INC_Y bytes. | |||
|     vldmia YO, { s4 - s5 } | |||
|     FMAC_R1 s4 , s0 , s12 | |||
|     FMAC_I1 s5 , s0 , s13 | |||
|     FMAC_R2 s4 , s1 , s13 | |||
|     FMAC_I2 s5 , s1 , s12 | |||
|     vstmia YO, { s4 - s5 } | |||
|     add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| // Transposed single-precision complex GEMV kernel: | |||
| //   y[j] += alpha * sum_i A[i + j*lda] * x[i] | |||
| // with the sign variant selected by CONJ / XCONJ at build time. | |||
| // In:  r0 = m, r1 = n, r3 = A, s0/s1 = alpha (kept live in those registers | |||
| //      for the whole routine), stack = lda, x, incx, y, incy.  Out: r0 = 0. | |||
| // Columns are handled two at a time, then a final odd column; the F* | |||
| // labels are the incx == incy == 1 path and the S* labels the strided path. | |||
| // Results of ands/asrs are tested with beq: those instructions do not | |||
| // write the V flag, so ble would depend on a stale V left by an earlier | |||
| // compare. All tested counts are known to be non-negative here. | |||
|     .align 5 | |||
|     push {r4 - r9 , fp} | |||
|     add fp, sp, #28 // fp = sp at entry: stack arguments start at [fp, #0] | |||
|     sub sp, sp, #STACKSIZE // reserve stack | |||
|     sub r12, fp, #192 | |||
| #if defined(DOUBLE) | |||
|     vstm r12, { d8 - d15 } // store floating point registers | |||
| #else | |||
|     vstm r12, { s8 - s15 } // store floating point registers | |||
| #endif | |||
|     cmp M, #0 | |||
|     ble cgemvt_kernel_L999 // nothing to do for m <= 0 | |||
|     cmp OLD_N, #0 | |||
|     ble cgemvt_kernel_L999 // nothing to do for n <= 0 | |||
|     str OLD_A, A // spill: r3 and r1 are reused below | |||
|     str OLD_N, N | |||
|     ldr INC_X , OLD_INC_X | |||
|     ldr INC_Y , OLD_INC_Y | |||
|     cmp INC_X, #0 | |||
|     beq cgemvt_kernel_L999 // a zero increment is rejected | |||
|     cmp INC_Y, #0 | |||
|     beq cgemvt_kernel_L999 | |||
|     ldr LDA, OLD_LDA | |||
| #if defined(DOUBLE) | |||
|     lsl LDA, LDA, #4 // LDA * SIZE * 2 | |||
| #else | |||
|     lsl LDA, LDA, #3 // LDA * SIZE * 2 | |||
| #endif | |||
|     cmp INC_X, #1 | |||
|     bne cgemvt_kernel_S2_BEGIN | |||
|     cmp INC_Y, #1 | |||
|     bne cgemvt_kernel_S2_BEGIN | |||
| cgemvt_kernel_F2_BEGIN: | |||
|     ldr YO , Y | |||
|     ldr J, N | |||
|     asrs J, J, #1 // J = N / 2 | |||
|     beq cgemvt_kernel_F1_BEGIN | |||
| cgemvt_kernel_F2X4: | |||
|     ldr AO1, A | |||
|     add AO2, AO1, LDA // second column of this pair | |||
|     add r3 , AO2, LDA // next pair starts two columns further on | |||
|     str r3 , A | |||
|     ldr XO , X | |||
|     INIT_F2 | |||
|     asrs I, M, #2 // I = M / 4 | |||
|     beq cgemvt_kernel_F2X1 | |||
| cgemvt_kernel_F2X4_10: | |||
|     KERNEL_F2X4 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_F2X4_10 | |||
| cgemvt_kernel_F2X1: | |||
|     ands I, M , #3 // I = M % 4 | |||
|     beq cgemvt_kernel_F2_END | |||
| cgemvt_kernel_F2X1_10: | |||
|     KERNEL_F2X1 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_F2X1_10 | |||
| cgemvt_kernel_F2_END: | |||
|     SAVE_F2 | |||
|     subs J , J , #1 | |||
|     bne cgemvt_kernel_F2X4 | |||
| cgemvt_kernel_F1_BEGIN: | |||
|     ldr J, N | |||
|     ands J, J, #1 // J = N % 2 | |||
|     beq cgemvt_kernel_L999 | |||
| cgemvt_kernel_F1X4: | |||
|     ldr AO1, A | |||
|     ldr XO , X | |||
|     INIT_F1 | |||
|     asrs I, M, #2 // I = M / 4 | |||
|     beq cgemvt_kernel_F1X1 | |||
| cgemvt_kernel_F1X4_10: | |||
|     KERNEL_F1X4 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_F1X4_10 | |||
| cgemvt_kernel_F1X1: | |||
|     ands I, M , #3 // I = M % 4 | |||
|     beq cgemvt_kernel_F1_END | |||
| cgemvt_kernel_F1X1_10: | |||
|     KERNEL_F1X1 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_F1X1_10 | |||
| cgemvt_kernel_F1_END: | |||
|     SAVE_F1 | |||
|     b cgemvt_kernel_L999 | |||
| /*************************************************************************************************************/ | |||
| cgemvt_kernel_S2_BEGIN: | |||
| #if defined(DOUBLE) | |||
|     lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||
|     lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 | |||
| #else | |||
|     lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||
|     lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 | |||
| #endif | |||
|     ldr YO , Y | |||
|     ldr J, N | |||
|     asrs J, J, #1 // J = N / 2 | |||
|     beq cgemvt_kernel_S1_BEGIN | |||
| cgemvt_kernel_S2X4: | |||
|     ldr AO1, A | |||
|     add AO2, AO1, LDA | |||
|     add r3 , AO2, LDA | |||
|     str r3 , A | |||
|     ldr XO , X | |||
|     INIT_S2 | |||
|     asrs I, M, #2 // I = M / 4 | |||
|     beq cgemvt_kernel_S2X1 | |||
| cgemvt_kernel_S2X4_10: | |||
|     KERNEL_S2X4 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_S2X4_10 | |||
| cgemvt_kernel_S2X1: | |||
|     ands I, M , #3 // I = M % 4 | |||
|     beq cgemvt_kernel_S2_END | |||
| cgemvt_kernel_S2X1_10: | |||
|     KERNEL_S2X1 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_S2X1_10 | |||
| cgemvt_kernel_S2_END: | |||
|     SAVE_S2 | |||
|     subs J , J , #1 | |||
|     bne cgemvt_kernel_S2X4 | |||
| cgemvt_kernel_S1_BEGIN: | |||
|     ldr J, N | |||
|     ands J, J, #1 // J = N % 2 | |||
|     beq cgemvt_kernel_L999 | |||
| cgemvt_kernel_S1X4: | |||
|     ldr AO1, A | |||
|     ldr XO , X | |||
|     INIT_S1 | |||
|     asrs I, M, #2 // I = M / 4 | |||
|     beq cgemvt_kernel_S1X1 | |||
| cgemvt_kernel_S1X4_10: | |||
|     KERNEL_S1X4 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_S1X4_10 | |||
| cgemvt_kernel_S1X1: | |||
|     ands I, M , #3 // I = M % 4 | |||
|     beq cgemvt_kernel_S1_END | |||
| cgemvt_kernel_S1X1_10: | |||
|     KERNEL_S1X1 | |||
|     subs I, I, #1 | |||
|     bne cgemvt_kernel_S1X1_10 | |||
| cgemvt_kernel_S1_END: | |||
|     SAVE_S1 | |||
| /*************************************************************************************************************/ | |||
| cgemvt_kernel_L999: | |||
|     sub r3, fp, #192 | |||
| #if defined(DOUBLE) | |||
|     vldm r3, { d8 - d15 } // restore floating point registers | |||
| #else | |||
|     vldm r3, { s8 - s15 } // restore floating point registers | |||
| #endif | |||
|     mov r0, #0 // set return value | |||
|     sub sp, fp, #28 // drop the local frame; sp points at the pushed registers | |||
|     pop {r4 -r9 ,fp} | |||
|     bx lr | |||
| EPILOGUE | |||
| @@ -0,0 +1,699 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2013/11/29 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 /* frame size constant; NOTE(review): its use is outside the visible part of this file */ | |||
| #define OLD_LDA [fp, #0 ] /* fp-relative slot; presumably the lda stack argument - confirm against the prologue */ | |||
| #define X [fp, #4 ] /* fp-relative slot; presumably the x pointer argument */ | |||
| #define OLD_INC_X [fp, #8 ] /* fp-relative slot; presumably the incx argument */ | |||
| #define Y [fp, #12 ] /* fp-relative slot; presumably the y pointer argument */ | |||
| #define OLD_INC_Y [fp, #16 ] /* fp-relative slot; presumably the incy argument */ | |||
| #define OLD_A r3 /* shares r3; presumably the incoming A pointer */ | |||
| #define OLD_M r0 /* shares r0 with AO1 */ | |||
| #define AO1 r0 /* walking pointer into A, advanced by LDA in KERNEL_F4X1 */ | |||
| #define N r1 | |||
| #define J r2 | |||
| #define AO2 r4 /* used only as a pld address in the macros visible here */ | |||
| #define XO r5 /* walking pointer into x */ | |||
| #define YO r6 /* walking pointer into y */ | |||
| #define LDA r7 /* byte step added to AO1/AO2 per x element */ | |||
| #define INC_X r8 | |||
| #define INC_Y r9 | |||
| #define I r12 | |||
| #define ALPHA_I [fp, #-236] /* 8-byte local slot read with fldd in SAVE_F4 */ | |||
| #define ALPHA_R [fp, #-244] /* 8-byte local slot read with fldd in SAVE_F4 */ | |||
| #define M [fp, #-252 ] /* local slot; presumably the saved m - confirm against the prologue */ | |||
| #define A [fp, #-256 ] /* local slot; presumably the A pointer for the next row block */ | |||
| #define X_PRE 64 /* pld byte offset applied to XO */ | |||
| #define Y_PRE 0 /* pld byte offset applied to YO */ | |||
| #define A_PRE 0 /* pld byte offset applied to AO2 */ | |||
| /**************************************************************************************/ | |||
| // KMAC_R / KMAC_I fold the product of the second components of A and x | |||
| // into the accumulators; their signs flip when exactly one of CONJ and | |||
| // XCONJ is defined. | |||
| #if defined(CONJ) == defined(XCONJ) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_I fmacd | |||
| #else | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #endif | |||
| // FMAC_* apply alpha to the accumulators when updating y; only XCONJ | |||
| // changes them, and FMAC_R1 / FMAC_I2 are the same in every variant. | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #if defined(XCONJ) | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #else | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_I1 fmacd | |||
| #endif | |||
| .macro INIT_F4 | |||
| // Prefetch y and clear the eight accumulators d8-d15. | |||
| // The zero comes from a core register: "vsub d8, d8, d8" would give NaN | |||
| // whenever d8 already held NaN or Inf and poison every sum. | |||
| // r3 is saved and restored around the zeroing, so no core register and | |||
| // no flag is changed by this macro. | |||
|     pld [ YO, #Y_PRE ] | |||
|     push {r3} | |||
|     mov r3 , #0 | |||
|     vmov d8 , r3 , r3 | |||
|     pop {r3} | |||
|     vmov.f64 d9 , d8 | |||
|     vmov.f64 d10, d8 | |||
|     vmov.f64 d11, d8 | |||
|     vmov.f64 d12, d8 | |||
|     vmov.f64 d13, d8 | |||
|     vmov.f64 d14, d8 | |||
|     vmov.f64 d15, d8 | |||
| .endm | |||
| /* KERNEL_F4X4: four unit-stride column steps, prefetching x before each pair. */ | |||
| .macro KERNEL_F4X4 | |||
| .rept 2 | |||
| pld [ XO, #X_PRE ] | |||
| KERNEL_F4X1 | |||
| KERNEL_F4X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_F4X1: one column step for a block of four rows (unit stride). */ | |||
| /* Reads 64 bytes at AO1 and 16 bytes at XO, accumulates into d8-d15, */ | |||
| /* then advances XO by 16 and AO1 / AO2 by LDA. AO2 is prefetch-only. */ | |||
| /* Loads of the next operands are interleaved with the multiply-adds. */ | |||
| .macro KERNEL_F4X1 | |||
| vldr d0, [ AO1 ] | |||
| vldr d4, [ XO ] | |||
| vldr d5, [ XO, #8 ] | |||
| pld [ AO2, #A_PRE ] | |||
| vldr d1, [ AO1, #8 ] | |||
| vmla.f64 d8, d0, d4 | |||
| vldr d2, [ AO1, #16 ] | |||
| vmla.f64 d9, d0, d5 | |||
| vldr d3, [ AO1, #24 ] | |||
| vmla.f64 d10, d2, d4 | |||
| vldr d0, [ AO1, #32 ] | |||
| vmla.f64 d11, d2, d5 | |||
| KMAC_R d8, d1, d5 | |||
| KMAC_I d9, d1, d4 | |||
| KMAC_R d10, d3, d5 | |||
| vldr d1, [ AO1, #40 ] | |||
| KMAC_I d11, d3, d4 | |||
| vldr d2, [ AO1, #48 ] | |||
| vmla.f64 d12, d0, d4 | |||
| vldr d3, [ AO1, #56 ] | |||
| vmla.f64 d13, d0, d5 | |||
| pld [ AO2, #A_PRE+32 ] | |||
| vmla.f64 d14, d2, d4 | |||
| vmla.f64 d15, d2, d5 | |||
| KMAC_R d12, d1, d5 | |||
| add XO, XO, #16 | |||
| KMAC_I d13, d1, d4 | |||
| add AO1, AO1, LDA | |||
| KMAC_R d14, d3, d5 | |||
| add AO2, AO2, LDA | |||
| KMAC_I d15, d3, d4 | |||
| .endm | |||
| /* SAVE_F4: reload alpha into d0/d1 and fold the four accumulated sums */ | |||
| /* (d8-d15) into four contiguous y elements; YO ends 64 bytes further on. */ | |||
| .macro SAVE_F4 | |||
| vldr d0, ALPHA_R | |||
| vldr d1, ALPHA_I | |||
| vldmia.f64 YO, { d4 - d7 } /* y elements 0 and 1 */ | |||
| FMAC_R1 d4, d0, d8 | |||
| FMAC_I1 d5, d0, d9 | |||
| FMAC_R2 d4, d1, d9 | |||
| FMAC_I2 d5, d1, d8 | |||
| FMAC_R1 d6, d0, d10 | |||
| FMAC_I1 d7, d0, d11 | |||
| FMAC_R2 d6, d1, d11 | |||
| FMAC_I2 d7, d1, d10 | |||
| vstmia.f64 YO!, { d4 - d7 } | |||
| vldmia.f64 YO, { d4 - d7 } /* y elements 2 and 3 */ | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| FMAC_R1 d6, d0, d14 | |||
| FMAC_I1 d7, d0, d15 | |||
| FMAC_R2 d6, d1, d15 | |||
| FMAC_I2 d7, d1, d14 | |||
| vstmia.f64 YO!, { d4 - d7 } | |||
| .endm | |||
| /* INIT_F1: clear the accumulator pair d8/d9. Zero comes from a core */ | |||
| /* register because vsub d8,d8,d8 yields NaN when d8 holds NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_F1 | |||
| mov r3 , #0 | |||
| vmov d8 , r3 , r3 /* d8 = +0.0 */ | |||
| vmov.f64 d9 , d8 | |||
| .endm | |||
| /* KERNEL_F1X1: one column step for a single row (unit stride). */ | |||
| /* Accumulates into d8/d9, then advances XO by 16 and AO1 by LDA. */ | |||
| .macro KERNEL_F1X1 | |||
| vldr d0, [ AO1 ] | |||
| vldr d1, [ AO1, #8 ] | |||
| vldr d4, [ XO ] | |||
| vldr d5, [ XO, #8 ] | |||
| vmla.f64 d8, d0, d4 | |||
| vmla.f64 d9, d0, d5 | |||
| KMAC_R d8, d1, d5 | |||
| KMAC_I d9, d1, d4 | |||
| add XO, XO, #16 | |||
| add AO1, AO1, LDA | |||
| .endm | |||
| /* SAVE_F1: reload alpha and fold the sum in d8/d9 into one y element, */ | |||
| /* then step YO forward by 16 bytes. */ | |||
| .macro SAVE_F1 | |||
| vldr d0, ALPHA_R | |||
| vldr d1, ALPHA_I | |||
| vldmia.f64 YO, { d4 - d5 } | |||
| FMAC_R1 d4, d0, d8 | |||
| FMAC_I1 d5, d0, d9 | |||
| FMAC_R2 d4, d1, d9 | |||
| FMAC_I2 d5, d1, d8 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, #16 | |||
| .endm | |||
| /****************************************************************************************/ | |||
| /* INIT_S4: clear the eight accumulators d8-d15 for the strided path. */ | |||
| /* Zero comes from a core register because vsub d8,d8,d8 yields NaN */ | |||
| /* when the stale value in d8 is NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_S4 | |||
| mov r3 , #0 | |||
| vmov d8 , r3 , r3 /* d8 = +0.0 */ | |||
| vmov.f64 d9 , d8 | |||
| vmov.f64 d10, d8 | |||
| vmov.f64 d11, d8 | |||
| vmov.f64 d12, d8 | |||
| vmov.f64 d13, d8 | |||
| vmov.f64 d14, d8 | |||
| vmov.f64 d15, d8 | |||
| .endm | |||
| /* KERNEL_S4X4: four consecutive strided column steps. */ | |||
| .macro KERNEL_S4X4 | |||
| .rept 4 | |||
| KERNEL_S4X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_S4X1: one column step for a block of four rows (strided x). */ | |||
| /* Accumulates into d8-d15, then advances XO by INC_X and AO1 / AO2 by LDA. */ | |||
| .macro KERNEL_S4X1 | |||
| vldr d0, [ AO1 ] | |||
| vldr d1, [ AO1, #8 ] | |||
| vldr d2, [ AO1, #16 ] | |||
| vldr d3, [ AO1, #24 ] | |||
| vldr d4, [ XO ] | |||
| vldr d5, [ XO, #8 ] | |||
| vmla.f64 d8, d0, d4 | |||
| vmla.f64 d9, d0, d5 | |||
| vmla.f64 d10, d2, d4 | |||
| vmla.f64 d11, d2, d5 | |||
| KMAC_R d8, d1, d5 | |||
| KMAC_I d9, d1, d4 | |||
| KMAC_R d10, d3, d5 | |||
| KMAC_I d11, d3, d4 | |||
| vldr d0, [ AO1, #32 ] | |||
| vldr d1, [ AO1, #40 ] | |||
| vldr d2, [ AO1, #48 ] | |||
| vldr d3, [ AO1, #56 ] | |||
| vmla.f64 d12, d0, d4 | |||
| vmla.f64 d13, d0, d5 | |||
| vmla.f64 d14, d2, d4 | |||
| vmla.f64 d15, d2, d5 | |||
| KMAC_R d12, d1, d5 | |||
| KMAC_I d13, d1, d4 | |||
| KMAC_R d14, d3, d5 | |||
| KMAC_I d15, d3, d4 | |||
| add XO, XO, INC_X | |||
| add AO1, AO1, LDA | |||
| add AO2, AO2, LDA | |||
| .endm | |||
| /* SAVE_S4: reload alpha and fold the four sums in d8-d15 into four y */ | |||
| /* elements that are INC_Y bytes apart; YO advances past each one. */ | |||
| .macro SAVE_S4 | |||
| vldr d0, ALPHA_R | |||
| vldr d1, ALPHA_I | |||
| vldmia.f64 YO, { d4 - d5 } /* element 0 */ | |||
| FMAC_R1 d4, d0, d8 | |||
| FMAC_I1 d5, d0, d9 | |||
| FMAC_R2 d4, d1, d9 | |||
| FMAC_I2 d5, d1, d8 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, INC_Y | |||
| vldmia.f64 YO, { d6 - d7 } /* element 1 */ | |||
| FMAC_R1 d6, d0, d10 | |||
| FMAC_I1 d7, d0, d11 | |||
| FMAC_R2 d6, d1, d11 | |||
| FMAC_I2 d7, d1, d10 | |||
| vstmia.f64 YO, { d6 - d7 } | |||
| add YO, YO, INC_Y | |||
| vldmia.f64 YO, { d4 - d5 } /* element 2 */ | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, INC_Y | |||
| vldmia.f64 YO, { d6 - d7 } /* element 3 */ | |||
| FMAC_R1 d6, d0, d14 | |||
| FMAC_I1 d7, d0, d15 | |||
| FMAC_R2 d6, d1, d15 | |||
| FMAC_I2 d7, d1, d14 | |||
| vstmia.f64 YO, { d6 - d7 } | |||
| add YO, YO, INC_Y | |||
| .endm | |||
| /* INIT_S1: clear the accumulator pair d8/d9 for the strided path. */ | |||
| /* Zero comes from a core register because vsub d8,d8,d8 yields NaN */ | |||
| /* when the stale value in d8 is NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_S1 | |||
| mov r3 , #0 | |||
| vmov d8 , r3 , r3 /* d8 = +0.0 */ | |||
| vmov.f64 d9 , d8 | |||
| .endm | |||
| /* KERNEL_S1X1: one column step for a single row (strided x). */ | |||
| /* Accumulates into d8/d9, then advances XO by INC_X and AO1 by LDA. */ | |||
| .macro KERNEL_S1X1 | |||
| vldr d0, [ AO1 ] | |||
| vldr d1, [ AO1, #8 ] | |||
| vldr d4, [ XO ] | |||
| vldr d5, [ XO, #8 ] | |||
| vmla.f64 d8, d0, d4 | |||
| vmla.f64 d9, d0, d5 | |||
| KMAC_R d8, d1, d5 | |||
| KMAC_I d9, d1, d4 | |||
| add XO, XO, INC_X | |||
| add AO1, AO1, LDA | |||
| .endm | |||
| /* SAVE_S1: reload alpha and fold the sum in d8/d9 into one y element, */ | |||
| /* then step YO forward by INC_Y bytes. */ | |||
| .macro SAVE_S1 | |||
| vldr d0, ALPHA_R | |||
| vldr d1, ALPHA_I | |||
| vldmia.f64 YO, { d4 - d5 } | |||
| FMAC_R1 d4, d0, d8 | |||
| FMAC_I1 d5, d0, d9 | |||
| FMAC_R2 d4, d1, d9 | |||
| FMAC_I2 d5, d1, d8 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| /* zgemv_n kernel entry. For every row i of A it accumulates */ | |||
| /* y[i] += alpha * (sum over j of a(i,j) * x[j]), with the sign pattern */ | |||
| /* chosen by CONJ / XCONJ. Register inputs read here: r0 = M, r1 = N, */ | |||
| /* r3 = A, d0 / d1 = alpha (real / imaginary). LDA, X, INC_X, Y and INC_Y */ | |||
| /* are read from the caller stack. Returns 0 in r0. */ | |||
| /* Loop-exit tests after asrs / ands use beq: those instructions do not */ | |||
| /* write the V flag, so the LE condition would depend on a stale V. */ | |||
| PROLOGUE | |||
| .align 5 | |||
| push {r4 - r9 , fp} | |||
| add fp, sp, #28 /* fp = sp at entry = base of the stacked arguments */ | |||
| sub sp, sp, #STACKSIZE /* reserve the spill area */ | |||
| sub r12, fp, #192 | |||
| vstm r12, { d8 - d15 } /* the kernel always clobbers d8-d15, so always save them */ | |||
| cmp OLD_M, #0 | |||
| ble zgemvn_kernel_L999 | |||
| cmp N, #0 | |||
| ble zgemvn_kernel_L999 | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| vstr d0 , ALPHA_R | |||
| vstr d1 , ALPHA_I | |||
| ldr INC_X , OLD_INC_X | |||
| ldr INC_Y , OLD_INC_Y | |||
| cmp INC_X, #0 | |||
| beq zgemvn_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq zgemvn_kernel_L999 | |||
| ldr LDA, OLD_LDA | |||
| lsl LDA, LDA, #4 /* LDA * 16: the macros step through 16-byte elements */ | |||
| cmp INC_X, #1 | |||
| bne zgemvn_kernel_S4_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne zgemvn_kernel_S4_BEGIN | |||
| /* ---- unit-stride path: INC_X == 1 and INC_Y == 1 ---- */ | |||
| zgemvn_kernel_F4_BEGIN: | |||
| ldr YO , Y | |||
| ldr I, M | |||
| asrs I, I, #2 /* I = M / 4 */ | |||
| beq zgemvn_kernel_F1_BEGIN | |||
| zgemvn_kernel_F4X4: | |||
| ldr AO1, A | |||
| add AO2, AO1, LDA | |||
| add r3 , AO1, #64 /* next block starts four elements further on */ | |||
| str r3 , A | |||
| add AO2, AO2, LDA | |||
| add AO2, AO2, LDA /* AO2 = AO1 + 3 * LDA, used for prefetch only */ | |||
| ldr XO , X | |||
| INIT_F4 | |||
| asrs J, N, #2 /* J = N / 4 */ | |||
| beq zgemvn_kernel_F4X1 | |||
| zgemvn_kernel_F4X4_10: | |||
| KERNEL_F4X4 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_F4X4_10 | |||
| zgemvn_kernel_F4X1: | |||
| ands J, N , #3 /* J = N % 4 */ | |||
| beq zgemvn_kernel_F4_END | |||
| zgemvn_kernel_F4X1_10: | |||
| KERNEL_F4X1 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_F4X1_10 | |||
| zgemvn_kernel_F4_END: | |||
| SAVE_F4 | |||
| subs I , I , #1 | |||
| bne zgemvn_kernel_F4X4 | |||
| zgemvn_kernel_F1_BEGIN: | |||
| ldr I, M | |||
| ands I, I , #3 /* I = M % 4 */ | |||
| beq zgemvn_kernel_L999 | |||
| zgemvn_kernel_F1X1: | |||
| ldr AO1, A | |||
| add r3, AO1, #16 /* next row starts one element further on */ | |||
| str r3, A | |||
| ldr XO , X | |||
| INIT_F1 | |||
| mov J, N | |||
| zgemvn_kernel_F1X1_10: | |||
| KERNEL_F1X1 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_F1X1_10 | |||
| zgemvn_kernel_F1_END: | |||
| SAVE_F1 | |||
| subs I , I , #1 | |||
| bne zgemvn_kernel_F1X1 | |||
| b zgemvn_kernel_L999 | |||
| /*************************************************************************************************************/ | |||
| /* ---- strided path: INC_X != 1 or INC_Y != 1 ---- */ | |||
| zgemvn_kernel_S4_BEGIN: | |||
| lsl INC_X, INC_X, #4 /* INC_X * 16 bytes */ | |||
| lsl INC_Y, INC_Y, #4 /* INC_Y * 16 bytes */ | |||
| ldr YO , Y | |||
| ldr I, M | |||
| asrs I, I, #2 /* I = M / 4 */ | |||
| beq zgemvn_kernel_S1_BEGIN | |||
| zgemvn_kernel_S4X4: | |||
| ldr AO1, A | |||
| add AO2, AO1, LDA | |||
| add r3 , AO1, #64 | |||
| str r3 , A | |||
| ldr XO , X | |||
| INIT_S4 | |||
| asrs J, N, #2 /* J = N / 4 */ | |||
| beq zgemvn_kernel_S4X1 | |||
| zgemvn_kernel_S4X4_10: | |||
| KERNEL_S4X4 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_S4X4_10 | |||
| zgemvn_kernel_S4X1: | |||
| ands J, N , #3 /* J = N % 4 */ | |||
| beq zgemvn_kernel_S4_END | |||
| zgemvn_kernel_S4X1_10: | |||
| KERNEL_S4X1 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_S4X1_10 | |||
| zgemvn_kernel_S4_END: | |||
| SAVE_S4 | |||
| subs I , I , #1 | |||
| bne zgemvn_kernel_S4X4 | |||
| zgemvn_kernel_S1_BEGIN: | |||
| ldr I, M | |||
| ands I, I , #3 /* I = M % 4 */ | |||
| beq zgemvn_kernel_L999 | |||
| zgemvn_kernel_S1X1: | |||
| ldr AO1, A | |||
| add r3, AO1, #16 | |||
| str r3, A | |||
| ldr XO , X | |||
| INIT_S1 | |||
| mov J, N | |||
| zgemvn_kernel_S1X1_10: | |||
| KERNEL_S1X1 | |||
| subs J, J, #1 | |||
| bne zgemvn_kernel_S1X1_10 | |||
| zgemvn_kernel_S1_END: | |||
| SAVE_S1 | |||
| subs I , I , #1 | |||
| bne zgemvn_kernel_S1X1 | |||
| /*************************************************************************************************************/ | |||
| zgemvn_kernel_L999: | |||
| sub r3, fp, #192 | |||
| vldm r3, { d8 - d15 } /* restore the saved floating point registers */ | |||
| mov r0, #0 /* set return value */ | |||
| sub sp, fp, #28 | |||
| pop {r4 -r9 ,fp} | |||
| bx lr | |||
| EPILOGUE | |||
| @@ -0,0 +1,608 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2013/11/29 Saar | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 /* bytes subtracted from sp in the prologue for spills */ | |||
| #define OLD_LDA [fp, #0 ] /* stack arguments; fp is set to sp + 28 after 7 registers are pushed */ | |||
| #define X [fp, #4 ] /* x base pointer, reloaded into XO for every column block */ | |||
| #define OLD_INC_X [fp, #8 ] /* loaded into INC_X */ | |||
| #define Y [fp, #12 ] /* y base pointer, loaded into YO */ | |||
| #define OLD_INC_Y [fp, #16 ] /* loaded into INC_Y */ | |||
| #define OLD_A r3 /* incoming A pointer; stored to the A slot in the prologue */ | |||
| #define OLD_N r1 /* incoming N; stored to the N slot so r1 can be reused as AO1 */ | |||
| #define M r0 /* bound of the inner I loop; stays in r0 */ | |||
| #define AO1 r1 /* running pointer into the first column of the block */ | |||
| #define J r2 /* outer loop counter */ | |||
| #define AO2 r4 /* running pointer into the second column (AO1 + LDA) */ | |||
| #define XO r5 /* running pointer into x */ | |||
| #define YO r6 /* running pointer into y */ | |||
| #define LDA r7 /* lda shifted left to a byte stride */ | |||
| #define INC_X r8 /* x increment; shifted to bytes on the strided path */ | |||
| #define INC_Y r9 /* y increment; shifted to bytes on the strided path */ | |||
| #define I r12 /* inner loop counter */ | |||
| #define N [fp, #-252 ] /* stack slot holding the saved N */ | |||
| #define A [fp, #-256 ] /* stack slot holding the A address of the next column block */ | |||
| #define X_PRE 512 /* prefetch distance; no pld in the macros below references it */ | |||
| #define A_PRE 512 /* prefetch distance; no pld in the macros below references it */ | |||
| #define Y_PRE 32 /* prefetch distance; no pld in the macros below references it */ | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| #if !defined(CONJ) && !defined(XCONJ) /* below: re/im = first/second double of an element */ | |||
| #define KMAC_R fnmacd /* acc_re -= a_im * x_im */ | |||
| #define KMAC_I fmacd /* acc_im += a_im * x_re */ | |||
| #define FMAC_R1 fmacd /* y_re += alpha_re * acc_re */ | |||
| #define FMAC_R2 fnmacd /* y_re -= alpha_im * acc_im */ | |||
| #define FMAC_I1 fmacd /* y_im += alpha_re * acc_im */ | |||
| #define FMAC_I2 fmacd /* y_im += alpha_im * acc_re */ | |||
| #elif defined(CONJ) && !defined(XCONJ) /* kernel signs flipped, save signs as above */ | |||
| #define KMAC_R fmacd /* acc_re += a_im * x_im */ | |||
| #define KMAC_I fnmacd /* acc_im -= a_im * x_re */ | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif !defined(CONJ) && defined(XCONJ) /* kernel and save signs both flipped */ | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd /* y_re += alpha_im * acc_im */ | |||
| #define FMAC_I1 fnmacd /* y_im -= alpha_re * acc_im */ | |||
| #define FMAC_I2 fmacd | |||
| #else /* CONJ and XCONJ both defined: kernel signs as in the first case, save signs flipped */ | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| /* INIT_F2: clear the two complex accumulators d12-d15. Zero is built from */ | |||
| /* a core register because x - x is NaN when the stale register value is */ | |||
| /* NaN or Inf, so vsub cannot be used to clear. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_F2 | |||
| mov r3 , #0 | |||
| vmov d12, r3 , r3 /* d12 = +0.0 */ | |||
| vmov.f64 d13, d12 | |||
| vmov.f64 d14, d12 | |||
| vmov.f64 d15, d12 | |||
| .endm | |||
| /* KERNEL_F2X4: four consecutive unit-stride row steps for two columns. */ | |||
| .macro KERNEL_F2X4 | |||
| .rept 4 | |||
| KERNEL_F2X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_F2X1: one row step for two columns (unit stride). Loads one x */ | |||
| /* element and one element from each of AO1 / AO2 with post-increment, */ | |||
| /* accumulating into d12/d13 (column 1) and d14/d15 (column 2). */ | |||
| .macro KERNEL_F2X1 | |||
| vldmia.f64 XO!, { d2 - d3 } | |||
| vldmia.f64 AO1!, { d4 - d5 } | |||
| vmla.f64 d12, d4, d2 | |||
| vmla.f64 d13, d4, d3 | |||
| vldmia.f64 AO2!, { d8 - d9 } | |||
| KMAC_R d12, d5, d3 | |||
| KMAC_I d13, d5, d2 | |||
| vmla.f64 d14, d8, d2 | |||
| vmla.f64 d15, d8, d3 | |||
| KMAC_R d14, d9, d3 | |||
| KMAC_I d15, d9, d2 | |||
| .endm | |||
| /* SAVE_F2: fold the two sums in d12-d15 into two contiguous y elements, */ | |||
| /* using alpha as still held in d0 / d1; YO ends 32 bytes further on. */ | |||
| .macro SAVE_F2 | |||
| vldmia.f64 YO, { d4 - d7 } | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| FMAC_R1 d6, d0, d14 | |||
| FMAC_I1 d7, d0, d15 | |||
| FMAC_R2 d6, d1, d15 | |||
| FMAC_I2 d7, d1, d14 | |||
| vstmia.f64 YO!, { d4 - d7 } | |||
| .endm | |||
| /************************************************************************************************/ | |||
| /* INIT_F1: clear the accumulator pair d12/d13. Zero comes from a core */ | |||
| /* register because vsub d12,d12,d12 yields NaN when d12 holds NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_F1 | |||
| mov r3 , #0 | |||
| vmov d12, r3 , r3 /* d12 = +0.0 */ | |||
| vmov.f64 d13, d12 | |||
| .endm | |||
| /* KERNEL_F1X4: four consecutive unit-stride row steps for one column. */ | |||
| .macro KERNEL_F1X4 | |||
| .rept 4 | |||
| KERNEL_F1X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_F1X1: one row step for one column (unit stride); loads with */ | |||
| /* post-increment from XO and AO1 and accumulates into d12/d13. */ | |||
| .macro KERNEL_F1X1 | |||
| vldmia.f64 XO!, { d2 - d3 } | |||
| vldmia.f64 AO1!, { d4 - d5 } | |||
| vmla.f64 d12, d4, d2 | |||
| vmla.f64 d13, d4, d3 | |||
| KMAC_R d12, d5, d3 | |||
| KMAC_I d13, d5, d2 | |||
| .endm | |||
| /* SAVE_F1: fold the sum in d12/d13 into one y element using alpha in */ | |||
| /* d0 / d1; the store post-increments YO by 16 bytes. */ | |||
| .macro SAVE_F1 | |||
| vldmia.f64 YO, { d4 - d5 } | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| vstmia.f64 YO!, { d4 - d5 } | |||
| .endm | |||
| /************************************************************************************************/ | |||
| /* INIT_S2: clear the two complex accumulators d12-d15 for the strided */ | |||
| /* path. Zero comes from a core register because x - x is NaN when the */ | |||
| /* stale register value is NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_S2 | |||
| mov r3 , #0 | |||
| vmov d12, r3 , r3 /* d12 = +0.0 */ | |||
| vmov.f64 d13, d12 | |||
| vmov.f64 d14, d12 | |||
| vmov.f64 d15, d12 | |||
| .endm | |||
| /* KERNEL_S2X4: four consecutive strided row steps for two columns. */ | |||
| .macro KERNEL_S2X4 | |||
| .rept 4 | |||
| KERNEL_S2X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_S2X1: one row step for two columns (strided x). AO1 / AO2 are */ | |||
| /* post-incremented by the loads; XO is advanced by INC_X at the end. */ | |||
| .macro KERNEL_S2X1 | |||
| vldmia.f64 XO, { d2 - d3 } | |||
| vldmia.f64 AO1!, { d4 - d5 } | |||
| vldmia.f64 AO2!, { d8 - d9 } | |||
| vmla.f64 d12, d4, d2 | |||
| vmla.f64 d13, d4, d3 | |||
| KMAC_R d12, d5, d3 | |||
| KMAC_I d13, d5, d2 | |||
| vmla.f64 d14, d8, d2 | |||
| vmla.f64 d15, d8, d3 | |||
| KMAC_R d14, d9, d3 | |||
| KMAC_I d15, d9, d2 | |||
| add XO, XO, INC_X | |||
| .endm | |||
| /* SAVE_S2: fold the two sums in d12-d15 into two y elements that are */ | |||
| /* INC_Y bytes apart, using alpha in d0 / d1; YO advances past each one. */ | |||
| .macro SAVE_S2 | |||
| vldmia.f64 YO, { d4 - d5 } /* first element */ | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, INC_Y | |||
| vldmia.f64 YO, { d6 - d7 } /* second element */ | |||
| FMAC_R1 d6, d0, d14 | |||
| FMAC_I1 d7, d0, d15 | |||
| FMAC_R2 d6, d1, d15 | |||
| FMAC_I2 d7, d1, d14 | |||
| vstmia.f64 YO, { d6 - d7 } | |||
| add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************************/ | |||
| /* INIT_S1: clear the accumulator pair d12/d13 for the strided path. */ | |||
| /* Zero comes from a core register because vsub d12,d12,d12 yields NaN */ | |||
| /* when d12 holds NaN or Inf. */ | |||
| /* Clobbers r3, which is only a scratch register wherever this is expanded. */ | |||
| .macro INIT_S1 | |||
| mov r3 , #0 | |||
| vmov d12, r3 , r3 /* d12 = +0.0 */ | |||
| vmov.f64 d13, d12 | |||
| .endm | |||
| /* KERNEL_S1X4: four consecutive strided row steps for one column. */ | |||
| .macro KERNEL_S1X4 | |||
| .rept 4 | |||
| KERNEL_S1X1 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_S1X1: one row step for one column (strided x). AO1 is */ | |||
| /* post-incremented by the load; XO is advanced by INC_X at the end. */ | |||
| .macro KERNEL_S1X1 | |||
| vldmia.f64 XO, { d2 - d3 } | |||
| vldmia.f64 AO1!, { d4 - d5 } | |||
| vmla.f64 d12, d4, d2 | |||
| vmla.f64 d13, d4, d3 | |||
| KMAC_R d12, d5, d3 | |||
| KMAC_I d13, d5, d2 | |||
| add XO, XO, INC_X | |||
| .endm | |||
| /* SAVE_S1: fold the sum in d12/d13 into one y element using alpha in */ | |||
| /* d0 / d1, then step YO forward by INC_Y bytes. */ | |||
| .macro SAVE_S1 | |||
| vldmia.f64 YO, { d4 - d5 } | |||
| FMAC_R1 d4, d0, d12 | |||
| FMAC_I1 d5, d0, d13 | |||
| FMAC_R2 d4, d1, d13 | |||
| FMAC_I2 d5, d1, d12 | |||
| vstmia.f64 YO, { d4 - d5 } | |||
| add YO, YO, INC_Y | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| /* zgemv_t kernel entry. For every column j of A it accumulates */ | |||
| /* y[j] += alpha * (sum over i of a(i,j) * x[i]), with the sign pattern */ | |||
| /* chosen by CONJ / XCONJ. Register inputs read here: r0 = M, r1 = N, */ | |||
| /* r3 = A, d0 / d1 = alpha (real / imaginary, kept in registers because */ | |||
| /* no macro writes d0 or d1). LDA, X, INC_X, Y and INC_Y are read from the */ | |||
| /* caller stack. Returns 0 in r0. */ | |||
| /* Loop-exit tests after asrs / ands use beq: those instructions do not */ | |||
| /* write the V flag, so the LE condition would depend on a stale V. */ | |||
| PROLOGUE | |||
| .align 5 | |||
| push {r4 - r9 , fp} | |||
| add fp, sp, #28 /* fp = sp at entry = base of the stacked arguments */ | |||
| sub sp, sp, #STACKSIZE /* reserve the spill area */ | |||
| sub r12, fp, #192 | |||
| vstm r12, { d8 - d15 } /* the kernel always clobbers d8-d15, so always save them */ | |||
| cmp M, #0 | |||
| ble zgemvt_kernel_L999 | |||
| cmp OLD_N, #0 | |||
| ble zgemvt_kernel_L999 | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| ldr INC_X , OLD_INC_X | |||
| ldr INC_Y , OLD_INC_Y | |||
| cmp INC_X, #0 | |||
| beq zgemvt_kernel_L999 | |||
| cmp INC_Y, #0 | |||
| beq zgemvt_kernel_L999 | |||
| ldr LDA, OLD_LDA | |||
| lsl LDA, LDA, #4 /* LDA * SIZE * 2 = 16 bytes per element */ | |||
| cmp INC_X, #1 | |||
| bne zgemvt_kernel_S2_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne zgemvt_kernel_S2_BEGIN | |||
| /* ---- unit-stride path: INC_X == 1 and INC_Y == 1 ---- */ | |||
| zgemvt_kernel_F2_BEGIN: | |||
| ldr YO , Y | |||
| ldr J, N | |||
| asrs J, J, #1 /* J = N / 2 */ | |||
| beq zgemvt_kernel_F1_BEGIN | |||
| zgemvt_kernel_F2X4: | |||
| ldr AO1, A | |||
| add AO2, AO1, LDA | |||
| add r3 , AO2, LDA /* next block starts two columns further on */ | |||
| str r3 , A | |||
| ldr XO , X | |||
| INIT_F2 | |||
| asrs I, M, #2 /* I = M / 4 */ | |||
| beq zgemvt_kernel_F2X1 | |||
| zgemvt_kernel_F2X4_10: | |||
| KERNEL_F2X4 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_F2X4_10 | |||
| zgemvt_kernel_F2X1: | |||
| ands I, M , #3 /* I = M % 4 */ | |||
| beq zgemvt_kernel_F2_END | |||
| zgemvt_kernel_F2X1_10: | |||
| KERNEL_F2X1 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_F2X1_10 | |||
| zgemvt_kernel_F2_END: | |||
| SAVE_F2 | |||
| subs J , J , #1 | |||
| bne zgemvt_kernel_F2X4 | |||
| zgemvt_kernel_F1_BEGIN: | |||
| ldr J, N | |||
| ands J, J, #1 /* one column left over when N is odd */ | |||
| beq zgemvt_kernel_L999 | |||
| zgemvt_kernel_F1X4: | |||
| ldr AO1, A | |||
| ldr XO , X | |||
| INIT_F1 | |||
| asrs I, M, #2 /* I = M / 4 */ | |||
| beq zgemvt_kernel_F1X1 | |||
| zgemvt_kernel_F1X4_10: | |||
| KERNEL_F1X4 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_F1X4_10 | |||
| zgemvt_kernel_F1X1: | |||
| ands I, M , #3 /* I = M % 4 */ | |||
| beq zgemvt_kernel_F1_END | |||
| zgemvt_kernel_F1X1_10: | |||
| KERNEL_F1X1 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_F1X1_10 | |||
| zgemvt_kernel_F1_END: | |||
| SAVE_F1 | |||
| b zgemvt_kernel_L999 | |||
| /*************************************************************************************************************/ | |||
| /* ---- strided path: INC_X != 1 or INC_Y != 1 ---- */ | |||
| zgemvt_kernel_S2_BEGIN: | |||
| lsl INC_X, INC_X, #4 /* INC_X * SIZE * 2 */ | |||
| lsl INC_Y, INC_Y, #4 /* INC_Y * SIZE * 2 */ | |||
| ldr YO , Y | |||
| ldr J, N | |||
| asrs J, J, #1 /* J = N / 2 */ | |||
| beq zgemvt_kernel_S1_BEGIN | |||
| zgemvt_kernel_S2X4: | |||
| ldr AO1, A | |||
| add AO2, AO1, LDA | |||
| add r3 , AO2, LDA | |||
| str r3 , A | |||
| ldr XO , X | |||
| INIT_S2 | |||
| asrs I, M, #2 /* I = M / 4 */ | |||
| beq zgemvt_kernel_S2X1 | |||
| zgemvt_kernel_S2X4_10: | |||
| KERNEL_S2X4 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_S2X4_10 | |||
| zgemvt_kernel_S2X1: | |||
| ands I, M , #3 /* I = M % 4 */ | |||
| beq zgemvt_kernel_S2_END | |||
| zgemvt_kernel_S2X1_10: | |||
| KERNEL_S2X1 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_S2X1_10 | |||
| zgemvt_kernel_S2_END: | |||
| SAVE_S2 | |||
| subs J , J , #1 | |||
| bne zgemvt_kernel_S2X4 | |||
| zgemvt_kernel_S1_BEGIN: | |||
| ldr J, N | |||
| ands J, J, #1 /* one column left over when N is odd */ | |||
| beq zgemvt_kernel_L999 | |||
| zgemvt_kernel_S1X4: | |||
| ldr AO1, A | |||
| ldr XO , X | |||
| INIT_S1 | |||
| asrs I, M, #2 /* I = M / 4 */ | |||
| beq zgemvt_kernel_S1X1 | |||
| zgemvt_kernel_S1X4_10: | |||
| KERNEL_S1X4 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_S1X4_10 | |||
| zgemvt_kernel_S1X1: | |||
| ands I, M , #3 /* I = M % 4 */ | |||
| beq zgemvt_kernel_S1_END | |||
| zgemvt_kernel_S1X1_10: | |||
| KERNEL_S1X1 | |||
| subs I, I, #1 | |||
| bne zgemvt_kernel_S1X1_10 | |||
| zgemvt_kernel_S1_END: | |||
| SAVE_S1 | |||
| /*************************************************************************************************************/ | |||
| zgemvt_kernel_L999: | |||
| sub r3, fp, #192 | |||
| vldm r3, { d8 - d15 } /* restore the saved floating point registers */ | |||
| mov r0, #0 /* set return value */ | |||
| sub sp, fp, #28 | |||
| pop {r4 -r9 ,fp} | |||
| bx lr | |||
| EPILOGUE | |||