/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

/**************************************************************************************
* 2013/11/23 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
*
* 2013/11/02 Saar
*	UNROLL_N		4
*	UNROLL_M		4
*	DGEMM_P			128
*	DGEMM_Q			240
*	DGEMM_R			12288
*	A_PRE			128
*	B_PRE			128
*	C_PRE			32
*
* Performance on Odroid U2:
*
* 3072x3072	1 Core:		2.62 GFLOPS	ATLAS: 2.69	GFLOPS
* 3072x3072	2 Cores:	5.23 GFLOPS	ATLAS: 5.27	GFLOPS
* 3072x3072	3 Cores:	7.78 GFLOPS	ATLAS: 7.87	GFLOPS
* 3072x3072	4 Cores:	10.10 GFLOPS	ATLAS: 9.98	GFLOPS
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Incoming argument registers, in the order of the C prototype below
 * (the float alpha arrives in s0, every other argument in an x register):
 *                  x0           x1           x2          s0          x3        x4        x5         x6
 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */

/* Symbolic names for the general-purpose registers used by the kernel. */
#define origM		x0
#define origN		x1
#define origK		x2
#define origPA		x3
#define origPB		x4
#define pC		x5
#define LDC		x6
#define offset		x7
#define counterL	x8
#define counterI	x9
#define pB		x10
#define counterJ	x11
#define tempALPHA	x12
#define pCRow0		x13
#define pCRow1		x14
#define pCRow2		x15
#define pA		x16

/* General-purpose register map. */
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC	(scaled by 4 at function entry)
// 07 offset		(alias only, not referenced by the kernel code)
// 08 counterL
// 09 counterI
// 10 pB
// 11 counterJ
// 12 tempALPHA		(bit copy of alpha while v0 is reused for A data)
// 13 pCRow0
// 14 pCRow1
// 15 pCRow2
// 16 pA
// 17			(not used)
// 18 platform register under AAPCS64, not used by this kernel
// 19 callee-saved, not used by this kernel
// 20 callee-saved, not used by this kernel
// 21 callee-saved, not used by this kernel
// 22 callee-saved, not used by this kernel
// 23 callee-saved, not used by this kernel
// 24 callee-saved, not used by this kernel
// 25 callee-saved, not used by this kernel
// 26 callee-saved, not used by this kernel
// 27 callee-saved, not used by this kernel
// 28 callee-saved, not used by this kernel
// 29 frame
// 30 link
// 31 sp

/* SIMD/FP register map. AAPCS64 makes the low 64 bits of v8-v15 callee-saved;
 * the prologue of this kernel spills d8-d15 and additionally d16/d17. */
//v00 orig ALPHA -> a00
//v01			a01
//v02			a02
//v03			a03
//v04			a10
//v05			a11
//v06			a12
//v07			a13
//v08 must save		b00
//v09 must save		b01
//v10 must save		b02
//v11 must save		b03
//v12 must save		b10
//v13 must save		b11
//v14 must save		b12
//v15 must save		b13
//v16 saved by prologue (not required by AAPCS64)	C00
//v17 saved by prologue (not required by AAPCS64)	C01
//v18			C02
//v19			C03
//v20			C10
//v21			C11
//v22			C12
//v23			C13
//v24			C20
//v25			C21
//v26			C22
//v27			C23
//v28			C30
//v29			C31
//v30			C32
//v31			C33

// Disabled code (an alternative prologue that would also spill x18-x21):
// 	add	sp,sp,#-(6*16)
// 	stp	x18,x19,[sp,#(0*16)]
// 	stp	x20,x21,[sp,#(1*16)]

/**************************************************************************************
* Macro definitions
*
* Naming: <OP><m>x<n>, where m is the number of consecutive floats consumed from pA
* per k step and n the number consumed from pB per k step.
*
*   INITmxn        clear the accumulators of an m x n tile
*   KERNELmxn_*    accumulate one k step (A[0..m-1] * B[0..n-1]) into the tile
*   SAVEmxn        C += alpha * tile, then advance pCRow0 past the m stored floats
*
* All macros clobber FP/SIMD registers only as listed per macro; pA and pB are
* post-incremented by the KERNEL macros.
*
* Accumulators are cleared with movi / fmov-from-wzr. They must NOT be cleared with
* "fsub x, x, x": the accumulator registers hold arbitrary stale data when the
* kernel is entered, and NaN - NaN or Inf - Inf yields NaN instead of zero, which
* would then be added into C.
**************************************************************************************/

/* 4x4 tile: v16, v20, v24, v28 each hold 4 floats (one vector per B element). */
.macro INIT4x4
	movi	v16.4s, #0
	movi	v20.4s, #0
	movi	v24.4s, #0
	movi	v28.4s, #0
.endm

/* First k step of a software-pipelined run: initialises the accumulators with a
 * plain product (no INIT4x4 needed) and preloads the operands of the next step
 * into v12/v14 (B) and v4 (A).
 * fmul is used rather than fmulx: fmulx returns 2.0 for 0 * Inf whereas the
 * fmla steps that follow produce NaN, so fmulx would make the result depend on
 * which k index such a pair happens to occupy. */
.macro KERNEL4x4_I
	ld1	{v8.2s}, [pB], #8
	ld1	{v10.2s}, [pB], #8
	ld1	{v0.4s}, [pA], #16

	fmul	v16.4s, v0.4s, v8.s[0]
	fmul	v20.4s, v0.4s, v8.s[1]
	fmul	v24.4s, v0.4s, v10.s[0]
	fmul	v28.4s, v0.4s, v10.s[1]

	ld1	{v12.2s}, [pB], #8		// for next round
	ld1	{v14.2s}, [pB], #8		// for next round
	ld1	{v4.4s}, [pA], #16		// for next round
.endm

/* Pipelined step: consume the operands preloaded in v4/v12/v14 and preload the
 * following step into v0/v8/v10. */
.macro KERNEL4x4_M2
	fmla	v16.4s, v4.4s, v12.s[0]
	fmla	v20.4s, v4.4s, v12.s[1]
	fmla	v24.4s, v4.4s, v14.s[0]
	fmla	v28.4s, v4.4s, v14.s[1]

	ld1	{v8.2s}, [pB], #8
	ld1	{v10.2s}, [pB], #8
	ld1	{v0.4s}, [pA], #16
.endm

/* Pipelined step: consume the operands in v0/v8/v10 and preload the following
 * step into v4/v12/v14. */
.macro KERNEL4x4_M1
	fmla	v16.4s, v0.4s, v8.s[0]
	fmla	v20.4s, v0.4s, v8.s[1]
	fmla	v24.4s, v0.4s, v10.s[0]
	fmla	v28.4s, v0.4s, v10.s[1]

	ld1	{v12.2s}, [pB], #8
	ld1	{v14.2s}, [pB], #8
	ld1	{v4.4s}, [pA], #16
.endm

/* Last step of a pipelined run: consume v4/v12/v14 without loading anything. */
.macro KERNEL4x4_E
	fmla	v16.4s, v4.4s, v12.s[0]
	fmla	v20.4s, v4.4s, v12.s[1]
	fmla	v24.4s, v4.4s, v14.s[0]
	fmla	v28.4s, v4.4s, v14.s[1]
.endm

/* Stand-alone (non-pipelined) k step: load, then accumulate. */
.macro KERNEL4x4_SUB
	ld1	{v8.2s}, [pB], #8
	ld1	{v10.2s}, [pB], #8
	ld1	{v0.4s}, [pA], #16

	fmla	v16.4s, v0.4s, v8.s[0]
	fmla	v20.4s, v0.4s, v8.s[1]
	fmla	v24.4s, v0.4s, v10.s[0]
	fmla	v28.4s, v0.4s, v10.s[1]
.endm

/* C[row r][0..3] += alpha * acc_r for the four rows pCRow0 + r*LDC.
 * Clobbers v0 (reloaded with alpha), v8, v12, pCRow1, pCRow2.
 * pCRow0 is advanced by 16 bytes through the post-indexed first store. */
.macro SAVE4x4
	add	pCRow1, pCRow0, LDC		// second row pointer, taken before pCRow0 moves
	mov	v0.d[0], tempALPHA		// alpha back into s0 (v0 held A data)

	ld1	{v8.4s}, [pCRow0]		// first row
	fmla	v8.4s, v16.4s, v0.s[0]
	st1	{v8.4s}, [pCRow0], #16

	ld1	{v12.4s}, [pCRow1]		// second row
	fmla	v12.4s, v20.4s, v0.s[0]
	st1	{v12.4s}, [pCRow1]

	add	pCRow2, pCRow1, LDC		// third row
	ld1	{v8.4s}, [pCRow2]
	fmla	v8.4s, v24.4s, v0.s[0]
	st1	{v8.4s}, [pCRow2]

	add	pCRow1, pCRow2, LDC		// pCRow1 now addresses the fourth row
	ld1	{v12.4s}, [pCRow1]
	fmla	v12.4s, v28.4s, v0.s[0]
	st1	{v12.4s}, [pCRow1]
.endm

/******************************************************************************/

/* 2x4 tile: s16/s17, s20/s21, s24/s25, s28/s29 (one pair per B element). */
.macro INIT2x4
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s24, wzr
	fmov	s25, wzr
	fmov	s28, wzr
	fmov	s29, wzr
.endm

.macro KERNEL2x4_SUB
	ldr	s8, [pB]
	ldr	s9, [pB, #4]
	ldr	s10, [pB, #8]
	ldr	s11, [pB, #12]
	ldr	s0, [pA]
	ldr	s1, [pA, #4]

	fmadd	s16, s0, s8, s16
	fmadd	s17, s1, s8, s17
	fmadd	s20, s0, s9, s20
	fmadd	s21, s1, s9, s21
	fmadd	s24, s0, s10, s24
	fmadd	s25, s1, s10, s25
	fmadd	s28, s0, s11, s28
	fmadd	s29, s1, s11, s29

	add	pA, pA, #8
	add	pB, pB, #16
.endm

/* Preprocessor helpers used by the scalar SAVE macros:
 *   F1ST: op1 = op2 * op3 + op1   (C value += alpha * accumulator)
 *   L1ST: op1 = load from [op2 + op3]
 */
#define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
#define L1ST( op1, op2, op3) ldr op1, [op2, op3]

/* Clobbers v0 (alpha), s8, s9, s12, s13, pCRow1, pCRow2; pCRow0 += 8. */
.macro SAVE2x4
	add	pCRow1, pCRow0, LDC
	add	pCRow2, pCRow1, LDC
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	L1ST ( s9, pCRow0, #4)
	F1ST ( s8, s0, s16)
	F1ST ( s9, s0, s17)
	str	s8, [pCRow0, #0]
	str	s9, [pCRow0, #4]

	ldr	s12, [pCRow1, #0]
	ldr	s13, [pCRow1, #4]
	F1ST ( s12, s0, s20)
	F1ST ( s13, s0, s21)
	str	s12, [pCRow1, #0]
	str	s13, [pCRow1, #4]

	L1ST ( s8, pCRow2, #0)
	L1ST ( s9, pCRow2, #4)
	F1ST ( s8, s0, s24)
	F1ST ( s9, s0, s25)
	str	s8, [pCRow2, #0]
	str	s9, [pCRow2, #4]

	add	pCRow1, pCRow2, LDC		// fourth row
	ldr	s12, [pCRow1, #0]
	ldr	s13, [pCRow1, #4]
	F1ST ( s12, s0, s28)
	F1ST ( s13, s0, s29)
	str	s12, [pCRow1, #0]
	str	s13, [pCRow1, #4]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* 1x4 tile: s16, s20, s24, s28. */
.macro INIT1x4
	fmov	s16, wzr
	fmov	s20, wzr
	fmov	s24, wzr
	fmov	s28, wzr
.endm

.macro KERNEL1x4_SUB
	ldr	s8, [pB]
	ldr	s9, [pB, #4]
	ldr	s10, [pB, #8]
	ldr	s11, [pB, #12]
	ldr	s0, [pA]

	fmadd	s16, s0, s8, s16
	fmadd	s20, s0, s9, s20
	fmadd	s24, s0, s10, s24
	fmadd	s28, s0, s11, s28

	add	pA, pA, #4
	add	pB, pB, #16
.endm

/* Clobbers v0 (alpha), s8, s12, pCRow1, pCRow2; pCRow0 += 4. */
.macro SAVE1x4
	add	pCRow1, pCRow0, LDC
	add	pCRow2, pCRow1, LDC
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	F1ST ( s8, s0, s16)
	str	s8, [pCRow0, #0]

	L1ST ( s12, pCRow1, #0)
	F1ST ( s12, s0, s20)
	str	s12, [pCRow1, #0]

	L1ST ( s8, pCRow2, #0)
	F1ST ( s8, s0, s24)
	str	s8, [pCRow2, #0]

	add	pCRow1, pCRow2, LDC		// fourth row
	L1ST ( s12, pCRow1, #0)
	F1ST ( s12, s0, s28)
	str	s12, [pCRow1, #0]

	add	pCRow0, pCRow0, #4
.endm

/******************************************************************************/
/******************************************************************************/

/* 4x2 tile: s16-s19 (first B element), s20-s23 (second B element). */
.macro INIT4x2
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s18, wzr
	fmov	s19, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s22, wzr
	fmov	s23, wzr
.endm

.macro KERNEL4x2_SUB
	ldr	s8, [pB]
	ldr	s9, [pB, #4]
	ldr	s0, [pA]
	ldr	s1, [pA, #4]
	ldr	s2, [pA, #8]
	ldr	s3, [pA, #12]

	fmadd	s16, s0, s8, s16
	fmadd	s17, s1, s8, s17
	fmadd	s18, s2, s8, s18
	fmadd	s19, s3, s8, s19

	fmadd	s20, s0, s9, s20
	fmadd	s21, s1, s9, s21
	fmadd	s22, s2, s9, s22
	fmadd	s23, s3, s9, s23

	add	pA, pA, #16
	add	pB, pB, #8
.endm

/* Clobbers v0 (alpha), s8-s15, pCRow1; pCRow0 += 16. */
.macro SAVE4x2
	add	pCRow1, pCRow0, LDC
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	L1ST ( s9, pCRow0, #4)
	L1ST ( s10, pCRow0, #8)
	L1ST ( s11, pCRow0, #12)
	F1ST ( s8, s0, s16)
	F1ST ( s9, s0, s17)
	F1ST ( s10, s0, s18)
	F1ST ( s11, s0, s19)
	str	s8, [pCRow0]
	str	s9, [pCRow0, #4]
	str	s10, [pCRow0, #8]
	str	s11, [pCRow0, #12]

	L1ST ( s12, pCRow1, #0)
	L1ST ( s13, pCRow1, #4)
	L1ST ( s14, pCRow1, #8)
	L1ST ( s15, pCRow1, #12)
	F1ST ( s12, s0, s20)
	F1ST ( s13, s0, s21)
	F1ST ( s14, s0, s22)
	F1ST ( s15, s0, s23)
	str	s12, [pCRow1]
	str	s13, [pCRow1, #4]
	str	s14, [pCRow1, #8]
	str	s15, [pCRow1, #12]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* 2x2 tile: s16/s17 (first B element), s20/s21 (second B element). */
.macro INIT2x2
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
.endm

.macro KERNEL2x2_SUB
	ldr	s8, [pB]
	ldr	s9, [pB, #4]
	ldr	s0, [pA]
	ldr	s1, [pA, #4]

	fmadd	s16, s0, s8, s16
	fmadd	s17, s1, s8, s17
	fmadd	s20, s0, s9, s20
	fmadd	s21, s1, s9, s21

	add	pA, pA, #8
	add	pB, pB, #8
.endm

/* Clobbers v0 (alpha), s8, s9, s12, s13, pCRow1; pCRow0 += 8. */
.macro SAVE2x2
	add	pCRow1, pCRow0, LDC
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	L1ST ( s9, pCRow0, #4)
	F1ST ( s8, s0, s16)
	F1ST ( s9, s0, s17)
	str	s8, [pCRow0]
	str	s9, [pCRow0, #4]

	L1ST ( s12, pCRow1, #0)
	L1ST ( s13, pCRow1, #4)
	F1ST ( s12, s0, s20)
	F1ST ( s13, s0, s21)
	str	s12, [pCRow1]
	str	s13, [pCRow1, #4]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* 1x2 tile: s16 (first B element), s20 (second B element). */
.macro INIT1x2
	fmov	s16, wzr
	fmov	s20, wzr
.endm

.macro KERNEL1x2_SUB
	ldr	s8, [pB]
	ldr	s9, [pB, #4]
	ldr	s0, [pA]

	fmadd	s16, s0, s8, s16
	fmadd	s20, s0, s9, s20

	add	pA, pA, #4
	add	pB, pB, #8
.endm

/* Clobbers v0 (alpha), s8, s12, pCRow1; pCRow0 += 4. */
.macro SAVE1x2
	add	pCRow1, pCRow0, LDC
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	F1ST ( s8, s0, s16)
	str	s8, [pCRow0]

	L1ST ( s12, pCRow1, #0)
	F1ST ( s12, s0, s20)
	str	s12, [pCRow1]

	add	pCRow0, pCRow0, #4
.endm

/******************************************************************************/
/******************************************************************************/

/* 4x1 tile: s16-s19. */
.macro INIT4x1
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s18, wzr
	fmov	s19, wzr
.endm

.macro KERNEL4x1_SUB
	ldr	s8, [pB]
	ldr	s0, [pA]
	ldr	s1, [pA, #4]
	ldr	s2, [pA, #8]
	ldr	s3, [pA, #12]

	fmadd	s16, s0, s8, s16
	fmadd	s17, s1, s8, s17
	fmadd	s18, s2, s8, s18
	fmadd	s19, s3, s8, s19

	add	pA, pA, #16
	add	pB, pB, #4
.endm

/* Clobbers v0 (alpha), s8-s11; pCRow0 += 16. */
.macro SAVE4x1
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	L1ST ( s9, pCRow0, #4)
	L1ST ( s10, pCRow0, #8)
	L1ST ( s11, pCRow0, #12)
	F1ST ( s8, s0, s16)
	F1ST ( s9, s0, s17)
	F1ST ( s10, s0, s18)
	F1ST ( s11, s0, s19)
	str	s8, [pCRow0]
	str	s9, [pCRow0, #4]
	str	s10, [pCRow0, #8]
	str	s11, [pCRow0, #12]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* 2x1 tile: s16, s17. */
.macro INIT2x1
	fmov	s16, wzr
	fmov	s17, wzr
.endm

.macro KERNEL2x1_SUB
	ldr	s8, [pB]
	ldr	s0, [pA]
	ldr	s1, [pA, #4]

	fmadd	s16, s0, s8, s16
	fmadd	s17, s1, s8, s17

	add	pA, pA, #8
	add	pB, pB, #4
.endm

/* Clobbers v0 (alpha), s8, s9; pCRow0 += 8. */
.macro SAVE2x1
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	L1ST ( s9, pCRow0, #4)
	F1ST ( s8, s0, s16)
	F1ST ( s9, s0, s17)
	str	s8, [pCRow0]
	str	s9, [pCRow0, #4]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* 1x1 tile: s16. */
.macro INIT1x1
	fmov	s16, wzr
.endm

.macro KERNEL1x1_SUB
	ldr	s8, [pB]
	ldr	s0, [pA]

	fmadd	s16, s0, s8, s16

	add	pA, pA, #4
	add	pB, pB, #4
.endm

/* Clobbers v0 (alpha), s8; pCRow0 += 4. */
.macro SAVE1x1
	mov	v0.d[0], tempALPHA

	L1ST ( s8, pCRow0, #0)
	F1ST ( s8, s0, s16)
	str	s8, [pCRow0]

	add	pCRow0, pCRow0, #4
.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* Kernel entry point (label supplied by PROLOGUE from common.h).
*
* In:    x0 = M, x1 = N, x2 = K, s0 = alpha, x3 = A, x4 = B, x5 = C, x6 = ldc
* Out:   x0 = 0
* Does:  C += alpha * A * B, processed in tiles. N is consumed in groups of 4, then
*        2, then 1 (sections L4, L2, L1); inside each group M is consumed in groups
*        of 4, then 2, then 1 (M4, M2, M1). A is read sequentially through pA for a
*        whole N group; B is re-read from origPB for every tile.
* Stack: 80 bytes (d8-d15 as required by AAPCS64, plus d16/d17).
* Leaf function: no calls are made, x29/x30 are left untouched.
**************************************************************************************/

	PROLOGUE

	.align 5
	sub	sp, sp, #(5*16)
	stp	d8, d9, [sp, #(0*16)]
	stp	d10, d11, [sp, #(1*16)]
	stp	d12, d13, [sp, #(2*16)]
	stp	d14, d15, [sp, #(3*16)]
	stp	d16, d17, [sp, #(4*16)]		// not required by AAPCS64; kept so the frame layout is unchanged

	mov	tempALPHA, v0.d[0]		// park alpha in a GPR, v0 is reused for A data
	lsl	LDC, LDC, #2			// ldc = ldc * 4 (byte stride for 4-byte floats)

	mov	pB, origPB

	mov	counterJ, origN
	asr	counterJ, counterJ, #2		// J = N / 4
	cmp	counterJ, #0
	ble	sgemm_kernel_L2_BEGIN

/******************************************************************************/
/* N handled four at a time                                                   */
/******************************************************************************/

sgemm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #2		// pC advances by 4 * LDC for the next J iteration

	mov	pA, origPA			// pA = start of A array

sgemm_kernel_L4_M4_BEGIN:

	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = M / 4
	cmp	counterI, #0
	ble	sgemm_kernel_L4_M2_BEGIN

sgemm_kernel_L4_M4_20:

	mov	pB, origPB

	// The k loop is software pipelined in pairs of steps:
	//   L >= 2 : I, M2, (M1, M2) * (L - 2), M1, E   = 2 * L steps
	//   L == 1 : I, E                               = 2 steps
	//   L == 0 : accumulators are simply cleared
	// followed by one KERNEL4x4_SUB when K is odd.
	asr	counterL, origK, #1		// L = K / 2
	cmp	counterL, #2			// is there at least 4 to do?
	blt	sgemm_kernel_L4_M4_32

	KERNEL4x4_I				// do one in the K
	KERNEL4x4_M2				// do another in the K

	subs	counterL, counterL, #2		// one pair is done above, one pair is always done at the tail
	ble	sgemm_kernel_L4_M4_22a
	.align 5

sgemm_kernel_L4_M4_22:

	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L4_M4_22

sgemm_kernel_L4_M4_22a:

	KERNEL4x4_M1
	KERNEL4x4_E

	b	sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_32:				// less than 4 to do in the K direction

	tst	counterL, #1			// L is 0 or 1 here
	ble	sgemm_kernel_L4_M4_40		// L == 0 (after tst, ble acts as beq)

	KERNEL4x4_I
	KERNEL4x4_E

	b	sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_40:

	INIT4x4

sgemm_kernel_L4_M4_44:

	ands	counterL, origK, #1		// counterL = K % 2
	ble	sgemm_kernel_L4_M4_100

sgemm_kernel_L4_M4_46:

	KERNEL4x4_SUB

	subs	counterL, counterL, #1
	bne	sgemm_kernel_L4_M4_46

sgemm_kernel_L4_M4_100:

	SAVE4x4

sgemm_kernel_L4_M4_END:
	subs	counterI, counterI, #1
	bne	sgemm_kernel_L4_M4_20

sgemm_kernel_L4_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// M % 4 == 0: nothing left in M
	ble	sgemm_kernel_L4_END

	tst	counterI, #2			// is bit 1 of M set (two rows left)?
	ble	sgemm_kernel_L4_M1_BEGIN

sgemm_kernel_L4_M2_20:

	INIT2x4

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L4_M2_40

sgemm_kernel_L4_M2_22:

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L4_M2_22

sgemm_kernel_L4_M2_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L4_M2_100

sgemm_kernel_L4_M2_42:

	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L4_M2_42

sgemm_kernel_L4_M2_100:

	SAVE2x4

sgemm_kernel_L4_M2_END:

sgemm_kernel_L4_M1_BEGIN:

	tst	counterI, #1			// is bit 0 of M set (one row left)?
	ble	sgemm_kernel_L4_END

sgemm_kernel_L4_M1_20:

	INIT1x4

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L4_M1_40

sgemm_kernel_L4_M1_22:

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L4_M1_22

sgemm_kernel_L4_M1_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L4_M1_100

sgemm_kernel_L4_M1_42:

	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L4_M1_42

sgemm_kernel_L4_M1_100:

	SAVE1x4

sgemm_kernel_L4_END:

	add	origPB, origPB, origK, lsl #4	// B = B + K * 4 * 4

	subs	counterJ, counterJ, #1		// j--
	bgt	sgemm_kernel_L4_BEGIN

/*********************************************************************************************/
/* N remainder: two                                                                          */
/*********************************************************************************************/

sgemm_kernel_L2_BEGIN:				// fewer than 4 left in the N direction

	mov	counterJ, origN
	tst	counterJ, #3
	ble	sgemm_kernel_L999		// N % 4 == 0: nothing left, done

	tst	counterJ, #2			// is bit 1 of N set?
	ble	sgemm_kernel_L1_BEGIN

	mov	pCRow0, pC			// pCRow0 = pC
	add	pC, pC, LDC, lsl #1		// pC advances by 2 * LDC

	mov	pA, origPA			// pA = A

sgemm_kernel_L2_M4_BEGIN:

	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = M / 4
	cmp	counterI, #0
	ble	sgemm_kernel_L2_M2_BEGIN

sgemm_kernel_L2_M4_20:

	INIT4x2

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L2_M4_40
	.align 5

sgemm_kernel_L2_M4_22:

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M4_22

sgemm_kernel_L2_M4_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L2_M4_100

sgemm_kernel_L2_M4_42:

	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M4_42

sgemm_kernel_L2_M4_100:

	SAVE4x2

sgemm_kernel_L2_M4_END:

	subs	counterI, counterI, #1
	bgt	sgemm_kernel_L2_M4_20

sgemm_kernel_L2_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// M % 4 == 0: nothing left in M
	ble	sgemm_kernel_L2_END

	tst	counterI, #2			// is bit 1 of M set?
	ble	sgemm_kernel_L2_M1_BEGIN

sgemm_kernel_L2_M2_20:

	INIT2x2

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L2_M2_40

sgemm_kernel_L2_M2_22:

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M2_22

sgemm_kernel_L2_M2_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L2_M2_100

sgemm_kernel_L2_M2_42:

	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M2_42

sgemm_kernel_L2_M2_100:

	SAVE2x2

sgemm_kernel_L2_M2_END:

sgemm_kernel_L2_M1_BEGIN:

	tst	counterI, #1			// is bit 0 of M set?
	ble	sgemm_kernel_L2_END

sgemm_kernel_L2_M1_20:

	INIT1x2

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L2_M1_40

sgemm_kernel_L2_M1_22:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M1_22

sgemm_kernel_L2_M1_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L2_M1_100

sgemm_kernel_L2_M1_42:

	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L2_M1_42

sgemm_kernel_L2_M1_100:

	SAVE1x2

sgemm_kernel_L2_END:

	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4

/*********************************************************************************************/
/* N remainder: one                                                                          */
/*********************************************************************************************/

sgemm_kernel_L1_BEGIN:

	mov	counterJ, origN
	tst	counterJ, #1			// is bit 0 of N set?
	ble	sgemm_kernel_L999		// no: done

	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pCRow0, LDC			// pC advances by one LDC

	mov	pA, origPA			// pA = A

sgemm_kernel_L1_M4_BEGIN:

	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = M / 4
	cmp	counterI, #0
	ble	sgemm_kernel_L1_M2_BEGIN

sgemm_kernel_L1_M4_20:

	INIT4x1

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L1_M4_40
	.align 5

sgemm_kernel_L1_M4_22:

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M4_22

sgemm_kernel_L1_M4_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L1_M4_100

sgemm_kernel_L1_M4_42:

	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M4_42

sgemm_kernel_L1_M4_100:

	SAVE4x1

sgemm_kernel_L1_M4_END:

	subs	counterI, counterI, #1
	bgt	sgemm_kernel_L1_M4_20

sgemm_kernel_L1_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// M % 4 == 0: nothing left in M
	ble	sgemm_kernel_L1_END

	tst	counterI, #2			// is bit 1 of M set?
	ble	sgemm_kernel_L1_M1_BEGIN

sgemm_kernel_L1_M2_20:

	INIT2x1

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L1_M2_40

sgemm_kernel_L1_M2_22:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M2_22

sgemm_kernel_L1_M2_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L1_M2_100

sgemm_kernel_L1_M2_42:

	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M2_42

sgemm_kernel_L1_M2_100:

	SAVE2x1

sgemm_kernel_L1_M2_END:

sgemm_kernel_L1_M1_BEGIN:

	tst	counterI, #1			// is bit 0 of M set?
	ble	sgemm_kernel_L1_END

sgemm_kernel_L1_M1_20:

	INIT1x1

	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = K / 8
	cmp	counterL, #0
	ble	sgemm_kernel_L1_M1_40

sgemm_kernel_L1_M1_22:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M1_22

sgemm_kernel_L1_M1_40:

	ands	counterL, origK, #7		// counterL = K % 8
	ble	sgemm_kernel_L1_M1_100

sgemm_kernel_L1_M1_42:

	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	sgemm_kernel_L1_M1_42

sgemm_kernel_L1_M1_100:

	SAVE1x1

sgemm_kernel_L1_END:

sgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0*16)]
	ldp	d10, d11, [sp, #(1*16)]
	ldp	d12, d13, [sp, #(2*16)]
	ldp	d14, d15, [sp, #(3*16)]
	ldp	d16, d17, [sp, #(4*16)]
	add	sp, sp, #(5*16)
	ret

	EPILOGUE