| @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMKERNEL = zgemm_kernel_power9.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| @@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN: | |||
| LSGEMM_L8x16_LOOP_START: | |||
| LOAD8x16_0 /*we already zeroed */ | |||
| ##OffsetA=64 OffsetB=32 | |||
| addi AO,AO,2112 | |||
| addi BO,BO,32 | |||
| /*##OffsetA=64 OffsetB=32 | |||
| #addi AO,AO,2112 | |||
| #addi BO,BO,32 */ | |||
| mtctr L | |||
| @@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START: | |||
| LSGEMM_L8x16_LOOP: | |||
| KERNEL8x16_I1_L4_2 -2048,0, 0,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 1,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 2,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 3,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 4,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 5,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 6,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 7,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 8,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 9,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 10,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 11,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 12,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 13,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 14,0 | |||
| KERNEL8x16_I1_L4_2 -2048,0, 15,1 | |||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 7,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 8,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 9,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 10,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 11,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 12,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 13,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 14,0 | |||
| KERNEL8x16_I1_L4_2 64,32, 15,1 | |||
| bdnz LSGEMM_L8x16_LOOP | |||
| MY_ALIGN | |||
| LSGEMM_L8x16_LOOP_END: | |||
| END8x16 0, AO, BO, -2048, 0 | |||
| END8x16 0, AO, BO, 64, 32 | |||
| b LSGEMM_L8x16_SUB1 | |||
| MY_ALIGN | |||
| @@ -0,0 +1,257 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* alias for the 64-bit load mnemonic */ | |||
| #define STACKSIZE 32192 /* one stack step; the prologue subtracts it from SP four times */ | |||
| #define FZERO 312+192(SP) /* stack word that the prologue clears with stw r0 */ | |||
| #define M r3 /* row count; tiled by 8/4/2/1 in the included logic file */ | |||
| #define N r4 /* column count; handled as N/2 column pairs plus an optional single column */ | |||
| #define K r5 /* inner (k) dimension */ | |||
| #define A r8 /* base pointer of the A panel; copied to AO before the tile loops */ | |||
| #define B r9 /* base pointer of the B panel; source of the ZCOPYB_* copy loops */ | |||
| #define C r10 /* base pointer of C; copied to CO and advanced per column pair */ | |||
| #define LDC r6 /* C stride; reloaded from the caller frame on linux and shifted left by ZBASE_SHIFT */ | |||
| #define OFFSET r7 /* only loaded when TRMMKERNEL is defined */ | |||
| #define o0 0 /* literal 0; not referenced in the code shown here */ | |||
| #define alpha_r vs30 /* receives vs1 (register f1) during setup */ | |||
| #define alpha_i vs31 /* receives vs2 (register f2) during setup */ | |||
| #define VECSAVE r11 /* not referenced in the code shown here */ | |||
| #define FRAMEPOINTER r12 /* copy of SP on entry; used to reach the caller's FRAMESLOT arguments */ | |||
| #define BBUFFER r14 /* 4096-byte-aligned buffer address derived from SP; tile loops start BO here */ | |||
| #define L r15 /* k-loop trip count / tail count */ | |||
| #define ALPHA r16 /* set to SP+296+192 during setup; its use is not in the code shown here */ | |||
| #define T5 r17 /* scratch; prefetch offset in the x8 loops */ | |||
| #define T2 r19 /* scratch; prefetch offset */ | |||
| #define BBO r20 /* initialised to BBUFFER before the B copy loops */ | |||
| #define o8 r21 /* holds the constant 8 */ | |||
| #define I r22 /* counter for the 8-row tile loops (M >> 3) */ | |||
| #define J r23 /* counter for the column-pair loop (N >> 1) */ | |||
| #define AO r24 /* working pointer, initialised from A */ | |||
| #define BO r25 /* working pointer, initialised from B or BBUFFER */ | |||
| #define CO r26 /* working pointer, initialised from C */ | |||
| #define o16 r27 /* holds the constant 16 */ | |||
| #define T3 r28 /* scratch; prefetch offset in the x8 loops */ | |||
| #define T4 r29 /* scratch; prefetch offset in the x8 loops */ | |||
| #define PRE r30 /* prefetch distance, set to 512 during setup */ | |||
| #define T1 r31 /* general scratch / loop counter */ | |||
| #ifndef NEEDPARAM /* kernel body: save registers, set up constants, run the included tile logic, restore, return 0 */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| mr FRAMEPOINTER, SP /* remember the entry SP so caller-frame argument slots stay reachable */ | |||
| addi SP, SP, -STACKSIZE /* allocate 4*STACKSIZE bytes in four steps; addi only takes a 16-bit signed immediate */ | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 /* zero, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* save r31 down to r14 at 144..280(SP) */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv v20, 288(SP) /* save v20-v31, 16 bytes each, at 288..464(SP) */ | |||
| stxv v21, 304(SP) | |||
| stxv v22, 320(SP) | |||
| stxv v23, 336(SP) | |||
| stxv v24, 352(SP) | |||
| stxv v25, 368(SP) | |||
| stxv v26, 384(SP) | |||
| stxv v27, 400(SP) | |||
| stxv v28, 416(SP) | |||
| stxv v29, 432(SP) | |||
| stxv v30, 448(SP) | |||
| stxv v31, 464(SP) | |||
| stw r0, FZERO /* clear the 32-bit word at FZERO */ | |||
| #ifdef linux | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) /* LDC comes from the caller's frame, addressed via the saved entry SP */ | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
| cmpwi cr0, M, 0 /* nothing to do unless M, N and K are all positive */ | |||
| ble L999 | |||
| cmpwi cr0, N, 0 | |||
| ble L999 | |||
| cmpwi cr0, K, 0 | |||
| ble L999 | |||
| slwi LDC, LDC, ZBASE_SHIFT /* scale LDC by 2^ZBASE_SHIFT (presumably elements to bytes; ZBASE_SHIFT is defined outside this file) */ | |||
| li PRE, 512 | |||
| li o8 , 8 | |||
| li o16 , 16 | |||
| addi BBUFFER, SP, 512+4096 /* BBUFFER = (SP + 512 + 4096) & -4096: a 4096-aligned address above the save area */ | |||
| li T1, -4096 | |||
| and BBUFFER, BBUFFER, T1 | |||
| addi ALPHA, SP, 296+192 | |||
| xxlor alpha_r,vs1,vs1 /*copy from register f1 */ | |||
| xxlor alpha_i,vs2,vs2 /*copy from register f2 */ | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" | |||
| L999: /* common exit: restore everything saved above */ | |||
| addi r3, 0, 0 /* r3 = 0 (addi with RA = 0 loads the immediate) */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv v20, 288(SP) | |||
| lxv v21, 304(SP) | |||
| lxv v22, 320(SP) | |||
| lxv v23, 336(SP) | |||
| lxv v24, 352(SP) | |||
| lxv v25, 368(SP) | |||
| lxv v26, 384(SP) | |||
| lxv v27, 400(SP) | |||
| lxv v28, 416(SP) | |||
| lxv v29, 432(SP) | |||
| lxv v30, 448(SP) | |||
| lxv v31, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame in the same four steps used to allocate it */ | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,857 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define MY_ALIGN .align 3 | |||
| srawi. J, N, 1 /* column-pair section: J = N / 2 passes, each copying B and then tiling M by 8/4/2/1 */ | |||
| ble ZGEMM_L2_END /* N < 2: skip to the single-column section */ | |||
| ZGEMM_L2_BEGIN: | |||
| mr BO, B | |||
| mr BBO, BBUFFER | |||
| srawi. T1, K, 2 /* T1 = K / 4 passes of ZCOPYB_8 */ | |||
| ble ZGEMM_L2_COPYB1 | |||
| ZGEMM_L2_COPYB8: | |||
| addi T2, PRE, 128 /* second prefetch offset, 128 bytes beyond PRE */ | |||
| dcbt BO, PRE | |||
| dcbtst BBO, PRE | |||
| dcbtst BBO, T2 | |||
| ZCOPYB_8 /* macro from zgemm_macros_power9.S (not shown); presumably copies from BO to BBO and advances both - confirm */ | |||
| addic. T1, T1, -1 | |||
| bgt ZGEMM_L2_COPYB8 | |||
| ZGEMM_L2_COPYB1: | |||
| andi. T1, K, 3 /* K mod 4 leftover passes, one ZCOPYB_2 each */ | |||
| ble ZGEMM_L2_COPYB_END | |||
| ZGEMM_L2_COPYB_LOOP: | |||
| ZCOPYB_2 | |||
| addic. T1, T1, -1 | |||
| bgt ZGEMM_L2_COPYB_LOOP | |||
| ZGEMM_L2_COPYB_END: | |||
| mr CO, C | |||
| mr AO, A | |||
| slwi T1, LDC , 1 /* T1 = 2 * LDC */ | |||
| add C, C, T1 /* advance C by two LDC strides, ready for the next column pair */ | |||
| srawi. I, M, 3 /* I = M / 8: number of 8-row tiles */ | |||
| ble ZGEMM_L2x8_END | |||
| ZGEMM_L2x8_BEGIN: /* 2x8 tile pass; runs I = M >> 3 times per column pair */ | |||
| mr BO, BBUFFER /* every tile restarts B at the buffer base */ | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 5 /* L = (K-1) / 32: trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO2x8 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi., so it must not touch CR0 */ | |||
| ble ZGEMM_L2x8_SUB0 /* K <= 32: no main-loop pass */ | |||
| ZGEMM_L2x8_LOOP_START: | |||
| LOAD2x8 0 | |||
| li T2, 1024 /* T2..T5: byte offsets for the dcbt prefetches inside the loop */ | |||
| li T3, 1024+512 | |||
| li T4, 2048 | |||
| li T5, 2048+512 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_LOOP: /* 16 KERNEL2x8_L invocations per CTR iteration, interleaved with prefetches */ | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_L 128,64,0,0 | |||
| KERNEL2x8_L 128,64,1,0 | |||
| dcbt AO, T2 | |||
| KERNEL2x8_L 128,64,2,0 | |||
| KERNEL2x8_L 128,64,3,0 | |||
| dcbt AO, T3 | |||
| dcbt BO, T2 | |||
| KERNEL2x8_L 128,64,4,0 | |||
| KERNEL2x8_L 128,64,5,0 | |||
| dcbt AO, T4 | |||
| KERNEL2x8_L 128,64,6,0 | |||
| KERNEL2x8_L 128,64,7,0 | |||
| dcbt AO, T5 | |||
| dcbt BO, T3 | |||
| KERNEL2x8_L 128,64,8,0 | |||
| KERNEL2x8_L 128,64,9,0 | |||
| KERNEL2x8_L 128,64,10,0 | |||
| KERNEL2x8_L 128,64,11,0 | |||
| dcbt BO, T4 | |||
| KERNEL2x8_L 128,64,12,0 | |||
| KERNEL2x8_L 128,64,13,0 | |||
| KERNEL2x8_L 128,64,14,0 | |||
| KERNEL2x8_L 128,64,15,1 | |||
| bdnz ZGEMM_L2x8_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_LOOP_END: | |||
| END2x8 AO, BO, 128, 64 | |||
| b ZGEMM_L2x8_SUB1 | |||
| ZGEMM_L2x8_SUB0: | |||
| andi. L, K, 63 /* reached only when K <= 32, so L = K */ | |||
| b ZGEMM_L2x8_SUB2 | |||
| ZGEMM_L2x8_SUB1: | |||
| andi. L, T1, 31 /* L = (K-1) mod 32; zero means there is no tail */ | |||
| ble ZGEMM_L2x8_SAVE | |||
| ZGEMM_L2x8_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L2x8_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_SUB2_LOOP: | |||
| LOAD2x8 0 | |||
| KERNEL2x8_L 128,64, 0,0 | |||
| KERNEL2x8_L 128,64, 1,0 | |||
| KERNEL2x8_L 128,64, 2,0 | |||
| KERNEL2x8_E 128,64, 3,1 | |||
| bdnz ZGEMM_L2x8_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L2x8_SUB2_2 | |||
| LOAD2x8 0 | |||
| KERNEL2x8_L 128,64, 0,0 | |||
| KERNEL2x8_E 128,64, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L2x8_SUB2_1 | |||
| LOAD2x8 0 | |||
| KERNEL2x8_E 128,64, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x8_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L2x8_SAVE | |||
| KERNEL2x8 | |||
| /* addic. L, L, -1 | |||
| bgt ZGEMM_L2x8_SUB2_1*/ | |||
| ZGEMM_L2x8_SAVE: | |||
| SAVE2x8 | |||
| addic. I, I, -1 | |||
| bgt ZGEMM_L2x8_BEGIN /* next 8-row tile */ | |||
| ZGEMM_L2x8_END: | |||
| ZGEMM_L2x4_BEGIN: /* 2x4 tile, executed once when bit 2 of M is set */ | |||
| andi. T2, M, 7 | |||
| ble ZGEMM_L2x1_END /* M is a multiple of 8: skip every narrower tile */ | |||
| andi. T1, M, 4 | |||
| ble ZGEMM_L2x4_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 4 /* L = (K-1) / 16: trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO2x4 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L2x4_SUB0 /* K <= 16: no main-loop pass */ | |||
| ZGEMM_L2x4_LOOP_START: | |||
| LOAD2x4 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_LOOP: /* 8 KERNEL2x4_L invocations per CTR iteration */ | |||
| KERNEL2x4_L 64,64,0,0 | |||
| KERNEL2x4_L 64,64,1,0 | |||
| KERNEL2x4_L 64,64,2,0 | |||
| KERNEL2x4_L 64,64,3,0 | |||
| KERNEL2x4_L 64,64,4,0 | |||
| KERNEL2x4_L 64,64,5,0 | |||
| KERNEL2x4_L 64,64,6,0 | |||
| KERNEL2x4_L 64,64,7,1 | |||
| bdnz ZGEMM_L2x4_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_LOOP_END: | |||
| END2x4 AO, BO, 64, 64 | |||
| b ZGEMM_L2x4_SUB1 | |||
| ZGEMM_L2x4_SUB0: | |||
| andi. L, K, 31 /* reached only when K <= 16, so L = K */ | |||
| b ZGEMM_L2x4_SUB2 | |||
| ZGEMM_L2x4_SUB1: | |||
| andi. L, T1, 15 /* L = (K-1) mod 16; zero means there is no tail */ | |||
| ble ZGEMM_L2x4_SAVE | |||
| ZGEMM_L2x4_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L2x4_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_SUB2_LOOP: | |||
| LOAD2x4 0 | |||
| KERNEL2x4_L 64,64, 0,0 | |||
| KERNEL2x4_L 64,64, 1,0 | |||
| KERNEL2x4_L 64,64, 2,0 | |||
| KERNEL2x4_E 64,64, 3,1 | |||
| bdnz ZGEMM_L2x4_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L2x4_SUB2_2 | |||
| LOAD2x4 0 | |||
| KERNEL2x4_L 64,64, 0,0 | |||
| KERNEL2x4_E 64,64, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L2x4_SUB2_1 | |||
| LOAD2x4 0 | |||
| KERNEL2x4_E 64,64, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x4_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L2x4_SAVE | |||
| KERNEL2x4 | |||
| ZGEMM_L2x4_SAVE: | |||
| SAVE2x4 | |||
| ZGEMM_L2x4_END: | |||
| ZGEMM_L2x2_BEGIN: /* 2x2 tile, executed once when bit 1 of M is set */ | |||
| andi. T1, M, 2 | |||
| ble ZGEMM_L2x2_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 4 /* L = (K-1) / 16: trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO2x2 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L2x2_SUB0 /* K <= 16: no main-loop pass */ | |||
| ZGEMM_L2x2_LOOP_START: | |||
| LOAD2x2 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_LOOP: /* 8 KERNEL2x2_L invocations per CTR iteration */ | |||
| KERNEL2x2_L 32,64,0,0 | |||
| KERNEL2x2_L 32,64,1,0 | |||
| KERNEL2x2_L 32,64,2,0 | |||
| KERNEL2x2_L 32,64,3,0 | |||
| KERNEL2x2_L 32,64,4,0 | |||
| KERNEL2x2_L 32,64,5,0 | |||
| KERNEL2x2_L 32,64,6,0 | |||
| KERNEL2x2_L 32,64,7,1 | |||
| bdnz ZGEMM_L2x2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_LOOP_END: | |||
| END2x2 AO, BO, 32, 64 | |||
| b ZGEMM_L2x2_SUB1 | |||
| ZGEMM_L2x2_SUB0: | |||
| andi. L, K, 31 /* reached only when K <= 16, so L = K */ | |||
| b ZGEMM_L2x2_SUB2 | |||
| ZGEMM_L2x2_SUB1: | |||
| andi. L, T1, 15 /* L = (K-1) mod 16; zero means there is no tail */ | |||
| ble ZGEMM_L2x2_SAVE | |||
| ZGEMM_L2x2_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L2x2_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_SUB2_LOOP: | |||
| LOAD2x2 0 | |||
| KERNEL2x2_L 32,64, 0,0 | |||
| KERNEL2x2_L 32,64, 1,0 | |||
| KERNEL2x2_L 32,64, 2,0 | |||
| KERNEL2x2_E 32,64, 3,1 | |||
| bdnz ZGEMM_L2x2_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L2x2_SUB2_2 | |||
| LOAD2x2 0 | |||
| KERNEL2x2_L 32,64, 0,0 | |||
| KERNEL2x2_E 32,64, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L2x2_SUB2_1 | |||
| LOAD2x2 0 | |||
| KERNEL2x2_E 32,64, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x2_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L2x2_SAVE | |||
| KERNEL2x2 | |||
| ZGEMM_L2x2_SAVE: | |||
| SAVE2x2 | |||
| ZGEMM_L2x2_END: | |||
| ZGEMM_L2x1_BEGIN: /* 2x1 tile, executed once when bit 0 of M is set */ | |||
| andi. T1, M, 1 | |||
| ble ZGEMM_L2x1_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 4 /* L = (K-1) / 16: trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO2x1 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L2x1_SUB0 /* K <= 16: no main-loop pass */ | |||
| ZGEMM_L2x1_LOOP_START: | |||
| LOAD2x1 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_LOOP: /* 8 KERNEL2x1_L invocations per CTR iteration */ | |||
| KERNEL2x1_L 16,64,0,0 | |||
| KERNEL2x1_L 16,64,1,0 | |||
| KERNEL2x1_L 16,64,2,0 | |||
| KERNEL2x1_L 16,64,3,0 | |||
| KERNEL2x1_L 16,64,4,0 | |||
| KERNEL2x1_L 16,64,5,0 | |||
| KERNEL2x1_L 16,64,6,0 | |||
| KERNEL2x1_L 16,64,7,1 | |||
| bdnz ZGEMM_L2x1_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_LOOP_END: | |||
| END2x1 AO, BO, 16, 64 | |||
| b ZGEMM_L2x1_SUB1 | |||
| ZGEMM_L2x1_SUB0: | |||
| andi. L, K, 31 /* reached only when K <= 16, so L = K */ | |||
| b ZGEMM_L2x1_SUB2 | |||
| ZGEMM_L2x1_SUB1: | |||
| andi. L, T1, 15 /* L = (K-1) mod 16; zero means there is no tail */ | |||
| ble ZGEMM_L2x1_SAVE | |||
| ZGEMM_L2x1_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L2x1_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_SUB2_LOOP: | |||
| LOAD2x1 0 | |||
| KERNEL2x1_L 16,64, 0,0 | |||
| KERNEL2x1_L 16,64, 1,0 | |||
| KERNEL2x1_L 16,64, 2,0 | |||
| KERNEL2x1_E 16,64, 3,1 | |||
| bdnz ZGEMM_L2x1_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L2x1_SUB2_2 | |||
| LOAD2x1 0 | |||
| KERNEL2x1_L 16,64, 0,0 | |||
| KERNEL2x1_E 16,64, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L2x1_SUB2_1 | |||
| LOAD2x1 0 | |||
| KERNEL2x1_E 16,64, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L2x1_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L2x1_SAVE | |||
| KERNEL2x1 | |||
| ZGEMM_L2x1_SAVE: | |||
| SAVE2x1 | |||
| ZGEMM_L2x1_END: | |||
| slwi T1, K, 5 /* tail of the column-pair loop: T1 = K * 32 bytes */ | |||
| add B, B, T1 /* step B past the K*32 bytes used by this column pair */ | |||
| addic. J, J, -1 | |||
| bgt ZGEMM_L2_BEGIN /* more column pairs to process */ | |||
| andi. T2, N, 1 | |||
| ble L999_H1 /* N even: exit via the nearby trampoline, because a conditional branch only reaches +/-32 KB and L999 lies beyond the whole single-column section */ | |||
| ZGEMM_L2_END: | |||
| b ZGEMM_L1_BEGIN | |||
| L999_H1: /* trampoline: the unconditional b has the long displacement needed to reach L999 */ | |||
| b L999 | |||
| ZGEMM_L1_BEGIN: /* single-column section: runs only when N is odd */ | |||
| andi. T1, N, 1 | |||
| ble ZGEMM_L1_END | |||
| mr BO, B | |||
| mr BBO, BBUFFER | |||
| srawi. T1, K, 3 /* T1 = K / 8 passes of ZCOPYB_8 */ | |||
| ble ZGEMM_L1_COPYB1 | |||
| ZGEMM_L1_COPYB8: | |||
| addi T2, PRE, 128 /* second prefetch offset, 128 bytes beyond PRE */ | |||
| dcbt BO, PRE | |||
| dcbtst BBO, PRE | |||
| dcbtst BBO, T2 | |||
| ZCOPYB_8 /* macro from zgemm_macros_power9.S (not shown); presumably copies from BO to BBO and advances both - confirm */ | |||
| addic. T1, T1, -1 | |||
| bgt ZGEMM_L1_COPYB8 | |||
| ZGEMM_L1_COPYB1: | |||
| andi. T1, K, 7 /* K mod 8 leftover passes, one ZCOPYB_1 each */ | |||
| ble ZGEMM_L1_COPYB_END | |||
| ZGEMM_L1_COPYB_LOOP: | |||
| ZCOPYB_1 | |||
| addic. T1, T1, -1 | |||
| bgt ZGEMM_L1_COPYB_LOOP | |||
| ZGEMM_L1_COPYB_END: | |||
| mr CO, C | |||
| mr AO, A | |||
| srawi. I, M, 3 /* I = M / 8: number of 8-row tiles */ | |||
| ble ZGEMM_L1x8_END | |||
| ZGEMM_L1x8_BEGIN: /* 1x8 tile pass; runs I = M >> 3 times */ | |||
| mr BO, BBUFFER /* every tile restarts B at the buffer base */ | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 5 /* L = (K-1) / 32: trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO1x8 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi., so it must not touch CR0 */ | |||
| ble ZGEMM_L1x8_SUB0 /* K <= 32: no main-loop pass */ | |||
| ZGEMM_L1x8_LOOP_START: | |||
| LOAD1x8 0 | |||
| li T2, 1024 /* T2..T5: byte offsets for the dcbt prefetches inside the loop */ | |||
| li T3, 1024+512 | |||
| li T4, 2048 | |||
| li T5, 2048+512 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_LOOP: /* 16 KERNEL1x8_L invocations per CTR iteration, interleaved with prefetches */ | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL1x8_L 128,32,0,0 | |||
| KERNEL1x8_L 128,32,1,0 | |||
| dcbt AO, T2 | |||
| KERNEL1x8_L 128,32,2,0 | |||
| KERNEL1x8_L 128,32,3,0 | |||
| dcbt AO, T3 | |||
| dcbt BO, T2 | |||
| KERNEL1x8_L 128,32,4,0 | |||
| KERNEL1x8_L 128,32,5,0 | |||
| dcbt AO, T4 | |||
| KERNEL1x8_L 128,32,6,0 | |||
| KERNEL1x8_L 128,32,7,0 | |||
| dcbt AO, T5 | |||
| dcbt BO, T3 | |||
| KERNEL1x8_L 128,32,8,0 | |||
| KERNEL1x8_L 128,32,9,0 | |||
| KERNEL1x8_L 128,32,10,0 | |||
| KERNEL1x8_L 128,32,11,0 | |||
| dcbt BO, T4 | |||
| KERNEL1x8_L 128,32,12,0 | |||
| KERNEL1x8_L 128,32,13,0 | |||
| KERNEL1x8_L 128,32,14,0 | |||
| KERNEL1x8_L 128,32,15,1 | |||
| bdnz ZGEMM_L1x8_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_LOOP_END: | |||
| END1x8 AO, BO, 128, 32 | |||
| b ZGEMM_L1x8_SUB1 | |||
| ZGEMM_L1x8_SUB0: | |||
| andi. L, K, 63 /* reached only when K <= 32, so L = K */ | |||
| b ZGEMM_L1x8_SUB2 | |||
| ZGEMM_L1x8_SUB1: | |||
| andi. L, T1, 31 /* L = (K-1) mod 32; zero means there is no tail */ | |||
| ble ZGEMM_L1x8_SAVE | |||
| ZGEMM_L1x8_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L1x8_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_SUB2_LOOP: | |||
| LOAD1x8 0 | |||
| KERNEL1x8_L 128,32, 0,0 | |||
| KERNEL1x8_L 128,32, 1,0 | |||
| KERNEL1x8_L 128,32, 2,0 | |||
| KERNEL1x8_E 128,32, 3,1 | |||
| bdnz ZGEMM_L1x8_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L1x8_SUB2_2 | |||
| LOAD1x8 0 | |||
| KERNEL1x8_L 128,32, 0,0 | |||
| KERNEL1x8_E 128,32, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L1x8_SUB2_1 | |||
| LOAD1x8 0 | |||
| KERNEL1x8_E 128,32, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x8_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L1x8_SAVE | |||
| KERNEL1x8 | |||
| /* addic. L, L, -1 | |||
| bgt ZGEMM_L1x8_SUB2_1*/ | |||
| ZGEMM_L1x8_SAVE: | |||
| SAVE1x8 | |||
| addic. I, I, -1 | |||
| bgt ZGEMM_L1x8_BEGIN /* next 8-row tile */ | |||
| ZGEMM_L1x8_END: | |||
| ZGEMM_L1x4_BEGIN: /* 1x4 tile, executed once when bit 2 of M is set */ | |||
| andi. T2, M, 7 | |||
| ble ZGEMM_L1x1_END /* M is a multiple of 8: skip every narrower tile */ | |||
| andi. T1, M, 4 | |||
| ble ZGEMM_L1x4_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 5 /* L = (K-1) / 32 (shift by 5, not 16x): trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO1x4 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L1x4_SUB0 /* K <= 32: no main-loop pass */ | |||
| ZGEMM_L1x4_LOOP_START: | |||
| LOAD1x4 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_LOOP: /* 16 KERNEL1x4_L invocations per CTR iteration */ | |||
| KERNEL1x4_L 64,32,0,0 | |||
| KERNEL1x4_L 64,32,1,0 | |||
| KERNEL1x4_L 64,32,2,0 | |||
| KERNEL1x4_L 64,32,3,0 | |||
| KERNEL1x4_L 64,32,4,0 | |||
| KERNEL1x4_L 64,32,5,0 | |||
| KERNEL1x4_L 64,32,6,0 | |||
| KERNEL1x4_L 64,32,7,0 | |||
| KERNEL1x4_L 64,32,8,0 | |||
| KERNEL1x4_L 64,32,9,0 | |||
| KERNEL1x4_L 64,32,10,0 | |||
| KERNEL1x4_L 64,32,11,0 | |||
| KERNEL1x4_L 64,32,12,0 | |||
| KERNEL1x4_L 64,32,13,0 | |||
| KERNEL1x4_L 64,32,14,0 | |||
| KERNEL1x4_L 64,32,15,1 | |||
| bdnz ZGEMM_L1x4_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_LOOP_END: | |||
| END1x4 AO, BO, 64, 32 | |||
| b ZGEMM_L1x4_SUB1 | |||
| ZGEMM_L1x4_SUB0: | |||
| andi. L, K, 63 /* reached only when K <= 32, so L = K */ | |||
| b ZGEMM_L1x4_SUB2 | |||
| ZGEMM_L1x4_SUB1: | |||
| andi. L, T1, 31 /* L = (K-1) mod 32; zero means there is no tail */ | |||
| ble ZGEMM_L1x4_SAVE | |||
| ZGEMM_L1x4_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L1x4_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_SUB2_LOOP: | |||
| LOAD1x4 0 | |||
| KERNEL1x4_L 64,32, 0,0 | |||
| KERNEL1x4_L 64,32, 1,0 | |||
| KERNEL1x4_L 64,32, 2,0 | |||
| KERNEL1x4_E 64,32, 3,1 | |||
| bdnz ZGEMM_L1x4_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L1x4_SUB2_2 | |||
| LOAD1x4 0 | |||
| KERNEL1x4_L 64,32, 0,0 | |||
| KERNEL1x4_E 64,32, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L1x4_SUB2_1 | |||
| LOAD1x4 0 | |||
| KERNEL1x4_E 64,32, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x4_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L1x4_SAVE | |||
| KERNEL1x4 | |||
| ZGEMM_L1x4_SAVE: | |||
| SAVE1x4 | |||
| ZGEMM_L1x4_END: | |||
| ZGEMM_L1x2_BEGIN: /* 1x2 tile, executed once when bit 1 of M is set */ | |||
| andi. T1, M, 2 | |||
| ble ZGEMM_L1x2_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 5 /* L = (K-1) / 32 (shift by 5, not 16x): trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO1x2 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L1x2_SUB0 /* K <= 32: no main-loop pass */ | |||
| ZGEMM_L1x2_LOOP_START: | |||
| LOAD1x2 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_LOOP: /* 16 KERNEL1x2_L invocations per CTR iteration */ | |||
| KERNEL1x2_L 32,32,0,0 | |||
| KERNEL1x2_L 32,32,1,0 | |||
| KERNEL1x2_L 32,32,2,0 | |||
| KERNEL1x2_L 32,32,3,0 | |||
| KERNEL1x2_L 32,32,4,0 | |||
| KERNEL1x2_L 32,32,5,0 | |||
| KERNEL1x2_L 32,32,6,0 | |||
| KERNEL1x2_L 32,32,7,0 | |||
| KERNEL1x2_L 32,32,8,0 | |||
| KERNEL1x2_L 32,32,9,0 | |||
| KERNEL1x2_L 32,32,10,0 | |||
| KERNEL1x2_L 32,32,11,0 | |||
| KERNEL1x2_L 32,32,12,0 | |||
| KERNEL1x2_L 32,32,13,0 | |||
| KERNEL1x2_L 32,32,14,0 | |||
| KERNEL1x2_L 32,32,15,1 | |||
| bdnz ZGEMM_L1x2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_LOOP_END: | |||
| END1x2 AO, BO, 32, 32 | |||
| b ZGEMM_L1x2_SUB1 | |||
| ZGEMM_L1x2_SUB0: | |||
| andi. L, K, 63 /* reached only when K <= 32, so L = K */ | |||
| b ZGEMM_L1x2_SUB2 | |||
| ZGEMM_L1x2_SUB1: | |||
| andi. L, T1, 31 /* L = (K-1) mod 32; zero means there is no tail */ | |||
| ble ZGEMM_L1x2_SAVE | |||
| ZGEMM_L1x2_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L1x2_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_SUB2_LOOP: | |||
| LOAD1x2 0 | |||
| KERNEL1x2_L 32,32, 0,0 | |||
| KERNEL1x2_L 32,32, 1,0 | |||
| KERNEL1x2_L 32,32, 2,0 | |||
| KERNEL1x2_E 32,32, 3,1 | |||
| bdnz ZGEMM_L1x2_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L1x2_SUB2_2 | |||
| LOAD1x2 0 | |||
| KERNEL1x2_L 32,32, 0,0 | |||
| KERNEL1x2_E 32,32, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L1x2_SUB2_1 | |||
| LOAD1x2 0 | |||
| KERNEL1x2_E 32,32, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x2_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L1x2_SAVE | |||
| KERNEL1x2 | |||
| ZGEMM_L1x2_SAVE: | |||
| SAVE1x2 | |||
| ZGEMM_L1x2_END: | |||
| ZGEMM_L1x1_BEGIN: /* 1x1 tile, executed once when bit 0 of M is set */ | |||
| andi. T1, M, 1 | |||
| ble ZGEMM_L1x1_END | |||
| mr BO, BBUFFER | |||
| mr T1, K | |||
| addi T1,T1, -1 /* T1 = K - 1 */ | |||
| srawi. L, T1, 5 /* L = (K-1) / 32 (shift by 5, not 16x): trip count of the unrolled main loop; sets CR0 */ | |||
| ZERO1x1 /* macro defined in zgemm_macros_power9.S (not shown); the ble below still tests CR0 from the srawi. */ | |||
| ble ZGEMM_L1x1_SUB0 /* K <= 32: no main-loop pass */ | |||
| ZGEMM_L1x1_LOOP_START: | |||
| LOAD1x1 0 | |||
| mtctr L | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_LOOP: /* 16 KERNEL1x1_L invocations per CTR iteration */ | |||
| KERNEL1x1_L 16,32,0,0 | |||
| KERNEL1x1_L 16,32,1,0 | |||
| KERNEL1x1_L 16,32,2,0 | |||
| KERNEL1x1_L 16,32,3,0 | |||
| KERNEL1x1_L 16,32,4,0 | |||
| KERNEL1x1_L 16,32,5,0 | |||
| KERNEL1x1_L 16,32,6,0 | |||
| KERNEL1x1_L 16,32,7,0 | |||
| KERNEL1x1_L 16,32,8,0 | |||
| KERNEL1x1_L 16,32,9,0 | |||
| KERNEL1x1_L 16,32,10,0 | |||
| KERNEL1x1_L 16,32,11,0 | |||
| KERNEL1x1_L 16,32,12,0 | |||
| KERNEL1x1_L 16,32,13,0 | |||
| KERNEL1x1_L 16,32,14,0 | |||
| KERNEL1x1_L 16,32,15,1 | |||
| bdnz ZGEMM_L1x1_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_LOOP_END: | |||
| END1x1 AO, BO, 16, 32 | |||
| b ZGEMM_L1x1_SUB1 | |||
| ZGEMM_L1x1_SUB0: | |||
| andi. L, K, 63 /* reached only when K <= 32, so L = K */ | |||
| b ZGEMM_L1x1_SUB2 | |||
| ZGEMM_L1x1_SUB1: | |||
| andi. L, T1, 31 /* L = (K-1) mod 32; zero means there is no tail */ | |||
| ble ZGEMM_L1x1_SAVE | |||
| ZGEMM_L1x1_SUB2: /* tail dispatch on L: L/8 loop passes, then bits 4, 2 and 1 of L */ | |||
| srawi. T1,L, 3 /* T1 = L / 8 */ | |||
| ble ZGEMM_L1x1_SUB2_4 | |||
| mtctr T1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_SUB2_LOOP: | |||
| LOAD1x1 0 | |||
| KERNEL1x1_L 16,32, 0,0 | |||
| KERNEL1x1_L 16,32, 1,0 | |||
| KERNEL1x1_L 16,32, 2,0 | |||
| KERNEL1x1_E 16,32, 3,1 | |||
| bdnz ZGEMM_L1x1_SUB2_LOOP | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_SUB2_4: | |||
| andi. T1,L, 4 | |||
| ble ZGEMM_L1x1_SUB2_2 | |||
| LOAD1x1 0 | |||
| KERNEL1x1_L 16,32, 0,0 | |||
| KERNEL1x1_E 16,32, 1,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_SUB2_2: | |||
| andi. T1,L, 2 | |||
| ble ZGEMM_L1x1_SUB2_1 | |||
| LOAD1x1 0 | |||
| KERNEL1x1_E 16,32, 0,1 | |||
| MY_ALIGN | |||
| ZGEMM_L1x1_SUB2_1: | |||
| andi. T1,L, 1 | |||
| ble ZGEMM_L1x1_SAVE | |||
| KERNEL1x1 | |||
| ZGEMM_L1x1_SAVE: | |||
| SAVE1x1 | |||
| ZGEMM_L1x1_END: | |||
| ZGEMM_L1_END: | |||
| @@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_P 640 | |||
| #define DGEMM_DEFAULT_P 128 | |||
| #define CGEMM_DEFAULT_P 640 | |||
| #define ZGEMM_DEFAULT_P 320 | |||
| #define ZGEMM_DEFAULT_P 512 | |||
| #define SGEMM_DEFAULT_Q 1408 | |||
| #define DGEMM_DEFAULT_Q 384 | |||
| #define CGEMM_DEFAULT_Q 640 | |||
| #define ZGEMM_DEFAULT_Q 640 | |||
| #define ZGEMM_DEFAULT_Q 1152 | |||
| #define SYMV_P 8 | |||