| @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ | |||||
| for (i = 0; i < m * n * COMPSIZE; i++) { | for (i = 0; i < m * n * COMPSIZE; i++) { | ||||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| } | } | ||||
| fprintf(stderr, " SIZE Flops Time\n"); | fprintf(stderr, " SIZE Flops Time\n"); | ||||
| for (i = from; i <= to; i += step) { | for (i = from; i <= to; i += step) { | ||||
| @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | ZGEMMONCOPYOBJ = zgemm_oncopy.o | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ||||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | ZGEMMINCOPYOBJ = zgemm_incopy.o | ||||
| @@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| std r14, 280(SP) | std r14, 280(SP) | ||||
| stxv v20, 288(SP) | |||||
| stxv v21, 304(SP) | |||||
| stxv v22, 320(SP) | |||||
| stxv v23, 336(SP) | |||||
| stxv v24, 352(SP) | |||||
| stxv v25, 368(SP) | |||||
| stxv v26, 384(SP) | |||||
| stxv v27, 400(SP) | |||||
| stxv v28, 416(SP) | |||||
| stxv v29, 432(SP) | |||||
| stxv v30, 448(SP) | |||||
| stxv v31, 464(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| stfd f1, ALPHA_SP | stfd f1, ALPHA_SP | ||||
| @@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld r15, 272(SP) | ld r15, 272(SP) | ||||
| ld r14, 280(SP) | ld r14, 280(SP) | ||||
| lxv v20, 288(SP) | |||||
| lxv v21, 304(SP) | |||||
| lxv v22, 320(SP) | |||||
| lxv v23, 336(SP) | |||||
| lxv v24, 352(SP) | |||||
| lxv v25, 368(SP) | |||||
| lxv v26, 384(SP) | |||||
| lxv v27, 400(SP) | |||||
| lxv v28, 416(SP) | |||||
| lxv v29, 432(SP) | |||||
| lxv v30, 448(SP) | |||||
| lxv v31, 464(SP) | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
| blr | blr | ||||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define LOAD ld | #define LOAD ld | ||||
| #define STACKSIZE (512 ) | #define STACKSIZE (512 ) | ||||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| @@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROFCODE | PROFCODE | ||||
| addi SP, SP, -STACKSIZE | addi SP, SP, -STACKSIZE | ||||
| li r0, 0 | |||||
| mflr r0 | |||||
| stfd f14, 0(SP) | stfd f14, 0(SP) | ||||
| stfd f15, 8(SP) | stfd f15, 8(SP) | ||||
| @@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| std r14, 280(SP) | std r14, 280(SP) | ||||
| stxv v20, 288(SP) | |||||
| stxv v21, 304(SP) | |||||
| stxv v22, 320(SP) | |||||
| stxv v23, 336(SP) | |||||
| stxv v24, 352(SP) | |||||
| stxv v25, 368(SP) | |||||
| stxv v26, 384(SP) | |||||
| stxv v27, 400(SP) | |||||
| stxv v28, 416(SP) | |||||
| stxv v29, 432(SP) | |||||
| stxv v30, 448(SP) | |||||
| stxv v31, 464(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| std r0, FLINK_SAVE(SP) | |||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| @@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| slwi LDC, LDC, 2 | slwi LDC, LDC, 2 | ||||
| /* cmpwi cr0, M, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999_H1 | |||||
| */ | |||||
| /*alpha is stored in f1. convert to single and splat*/ | /*alpha is stored in f1. convert to single and splat*/ | ||||
| xscvdpspn alpha_r,vs1 | |||||
| xxspltw alpha_r,alpha_r,0 | |||||
| xscvdpspn alpha_r,vs1 | |||||
| xxspltw alpha_r,alpha_r,0 | |||||
| /*load reverse permute mask for big endian | /*load reverse permute mask for big endian | ||||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | uint128 = 0xc0d0e0f08090a0b0405060700010203 | ||||
| */ | */ | ||||
| lis T2, perm_const2@highest | lis T2, perm_const2@highest | ||||
| ori T2, T2, perm_const2@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| oris T2, T2, perm_const2@h | |||||
| ori T2, T2, perm_const2@l | |||||
| lis T1, perm_const1@highest | lis T1, perm_const1@highest | ||||
| lis T3, save_permute_12@highest | |||||
| lis T4, save_permute_11@highest | |||||
| lis T5, save_permute_22@highest | |||||
| lis T6, save_permute_21@highest | |||||
| ori T2, T2, perm_const2@higher | |||||
| ori T1, T1, perm_const1@higher | ori T1, T1, perm_const1@higher | ||||
| ori T3, T3, save_permute_12@higher | |||||
| ori T4, T4, save_permute_11@higher | |||||
| ori T5, T5, save_permute_22@higher | |||||
| ori T6, T6, save_permute_21@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| rldicr T1, T1, 32, 31 | rldicr T1, T1, 32, 31 | ||||
| rldicr T3, T3, 32, 31 | |||||
| rldicr T4, T4, 32, 31 | |||||
| rldicr T5, T5, 32, 31 | |||||
| rldicr T6, T6, 32, 31 | |||||
| oris T2, T2, perm_const2@h | |||||
| oris T1, T1, perm_const1@h | oris T1, T1, perm_const1@h | ||||
| oris T3, T3, save_permute_12@h | |||||
| oris T4, T4, save_permute_11@h | |||||
| oris T5, T5, save_permute_22@h | |||||
| oris T6, T6, save_permute_21@h | |||||
| ori T2, T2, perm_const2@l | |||||
| ori T1, T1, perm_const1@l | ori T1, T1, perm_const1@l | ||||
| ori T3, T3, save_permute_12@l | |||||
| ori T4, T4, save_permute_11@l | |||||
| ori T5, T5, save_permute_22@l | |||||
| ori T6, T6, save_permute_21@l | |||||
| li r0,0 | |||||
| mtvsrdd permute_mask,T2,T1 | mtvsrdd permute_mask,T2,T1 | ||||
| lis T2, save_permute_12@highest | |||||
| ori T2, T2, save_permute_12@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| oris T2, T2, save_permute_12@h | |||||
| ori T2, T2, save_permute_12@l | |||||
| lis T1, save_permute_11@highest | |||||
| ori T1, T1, save_permute_11@higher | |||||
| rldicr T1, T1, 32, 31 | |||||
| oris T1, T1, save_permute_11@h | |||||
| ori T1, T1, save_permute_11@l | |||||
| mtvsrdd save_permute_1,T2,T1 | |||||
| lis T2, save_permute_22@highest | |||||
| ori T2, T2, save_permute_22@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| oris T2, T2, save_permute_22@h | |||||
| ori T2, T2, save_permute_22@l | |||||
| lis T1, save_permute_21@highest | |||||
| ori T1, T1, save_permute_21@higher | |||||
| rldicr T1, T1, 32, 31 | |||||
| oris T1, T1, save_permute_21@h | |||||
| ori T1, T1, save_permute_21@l | |||||
| mtvsrdd save_permute_2,T2,T1 | |||||
| mtvsrdd save_permute_1,T3,T4 | |||||
| mtvsrdd save_permute_2,T5,T6 | |||||
| #include "sgemm_logic_power9.S" | #include "sgemm_logic_power9.S" | ||||
| .L999: | |||||
| addi r3, 0, 0 | |||||
| .L999: | |||||
| lfd f14, 0(SP) | lfd f14, 0(SP) | ||||
| lfd f15, 8(SP) | lfd f15, 8(SP) | ||||
| lfd f16, 16(SP) | lfd f16, 16(SP) | ||||
| @@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld r16, 264(SP) | ld r16, 264(SP) | ||||
| ld r15, 272(SP) | ld r15, 272(SP) | ||||
| ld r14, 280(SP) | ld r14, 280(SP) | ||||
| lxv v20, 288(SP) | |||||
| lxv v21, 304(SP) | |||||
| lxv v22, 320(SP) | |||||
| lxv v23, 336(SP) | |||||
| lxv v24, 352(SP) | |||||
| lxv v25, 368(SP) | |||||
| lxv v26, 384(SP) | |||||
| lxv v27, 400(SP) | |||||
| lxv v28, 416(SP) | |||||
| lxv v29, 432(SP) | |||||
| lxv v30, 448(SP) | |||||
| lxv v31, 464(SP) | |||||
| ld r0, FLINK_SAVE(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| mtlr r0 | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | blr | ||||
| EPILOGUE | EPILOGUE | ||||
| #endif | #endif | ||||
| @@ -1,5 +1,94 @@ | |||||
| #define MY_ALIGN .align 3 | #define MY_ALIGN .align 3 | ||||
| b L8 | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LMAIN_SUB: | |||||
| LOAD8x16_0 | |||||
| mtctr L | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LOOP: | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 7,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 8,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 9,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 10,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 11,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 12,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 13,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 14,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 15,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 16,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 17,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 18,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 19,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 20,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 21,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 22,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 23,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 24,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 25,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 26,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 27,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 28,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 29,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 30,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 31,1 | |||||
| bdnz LSGEMM_L8x16_LOOP | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LOOP_END: | |||||
| END8x16 0, AO, BO, 64, 32 | |||||
| blr | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_L64_SUB: | |||||
| LOAD8x16_0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 7,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 8,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 9,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 10,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 11,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 12,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 13,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 14,0 | |||||
| KERNEL8x16_I1_L4_3 64,32, 15,1 | |||||
| blr | |||||
| LSGEMM_L8x16_L32_SUB: | |||||
| LOAD8x16_0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||||
| KERNEL8x16_I1_L4_3 64,32, 7,1 | |||||
| blr | |||||
| LSGEMM_L8x16_L16_SUB: | |||||
| LOAD8x16_0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_3 64,32, 3,1 | |||||
| blr | |||||
| L8: | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| neg TEMP_REG, OFFSET | neg TEMP_REG, OFFSET | ||||
| #endif | #endif | ||||
| @@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN: | |||||
| REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 | REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 | ||||
| mr T12, T11 | mr T12, T11 | ||||
| addi T12,T12, -1 | addi T12,T12, -1 | ||||
| srawi. L, T12, 6 /**(T11-1) % 64x */ | |||||
| srawi. L, T12, 7 /**(T11-1) % 128x */ | |||||
| #else | #else | ||||
| mr T12, K | mr T12, K | ||||
| addi T12,T12, -1 | addi T12,T12, -1 | ||||
| srawi. L, T12, 6 /**(K-1) % 64x */ | |||||
| srawi. L, T12, 7 /**(K-1) % 128x */ | |||||
| #endif | #endif | ||||
| ZERO8x16 | ZERO8x16 | ||||
| ble LSGEMM_L8x16_SUB0 | ble LSGEMM_L8x16_SUB0 | ||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LOOP_START: | |||||
| LOAD8x16_0 /*we already zeroed */ | |||||
| /*##OffsetA=64 OffsetB=32 | |||||
| #addi AO,AO,2112 | |||||
| #addi BO,BO,32 */ | |||||
| mtctr L | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LOOP: | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 7,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 8,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 9,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 10,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 11,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 12,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 13,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 14,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 15,1 | |||||
| bdnz LSGEMM_L8x16_LOOP | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_LOOP_END: | |||||
| END8x16 0, AO, BO, 64, 32 | |||||
| b LSGEMM_L8x16_SUB1 | |||||
| bl LSGEMM_L8x16_LMAIN_SUB | |||||
| andi. L, T12, 127 | |||||
| ble LSGEMM_L8x16_SAVE | |||||
| b LSGEMM_L8x16_SUB2 | |||||
| MY_ALIGN | MY_ALIGN | ||||
| LSGEMM_L8x16_SUB0: | LSGEMM_L8x16_SUB0: | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| andi. L, T11, 127 | |||||
| andi. L, T11, 255 | |||||
| cmpwi T11,128 | |||||
| #else | #else | ||||
| andi. L, K, 127 | |||||
| andi. L, K, 255 | |||||
| cmpwi K,128 | |||||
| #endif | #endif | ||||
| b LSGEMM_L8x16_SUB2 | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_SUB1: | |||||
| #if defined(TRMMKERNEL) | |||||
| andi. L, T12, 63 | |||||
| #else | |||||
| andi. L, T12, 63 | |||||
| #endif | |||||
| ble LSGEMM_L8x16_SAVE | |||||
| bne LSGEMM_L8x16_SUB2 | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_SUB2_128: | |||||
| bl LSGEMM_L8x16_L64_SUB | |||||
| bl LSGEMM_L8x16_L64_SUB | |||||
| b LSGEMM_L8x16_SAVE | |||||
| MY_ALIGN | MY_ALIGN | ||||
| LSGEMM_L8x16_SUB2: | LSGEMM_L8x16_SUB2: | ||||
| srawi. T10,L, 5 | |||||
| andi. T10,L,64 | |||||
| ble LSGEMM_L8x16_SUB2_32 | |||||
| bl LSGEMM_L8x16_L64_SUB | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_SUB2_32: | |||||
| andi. T10,L, 32 | |||||
| ble LSGEMM_L8x16_SUB2_16 | ble LSGEMM_L8x16_SUB2_16 | ||||
| mtctr T10 | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_SUB2_LOOP: | |||||
| LOAD8x16_0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 3,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 4,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 5,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 6,0 | |||||
| KERNEL8x16_I1_L4_3 64,32, 7,1 | |||||
| bdnz LSGEMM_L8x16_SUB2_LOOP | |||||
| MY_ALIGN | |||||
| bl LSGEMM_L8x16_L32_SUB | |||||
| MY_ALIGN | |||||
| LSGEMM_L8x16_SUB2_16: | LSGEMM_L8x16_SUB2_16: | ||||
| andi. T10,L, 16 | andi. T10,L, 16 | ||||
| ble LSGEMM_L8x16_SUB2_8 | ble LSGEMM_L8x16_SUB2_8 | ||||
| LOAD8x16_0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 0,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 1,0 | |||||
| KERNEL8x16_I1_L4_2 64,32, 2,0 | |||||
| KERNEL8x16_I1_L4_3 64,32, 3,1 | |||||
| bl LSGEMM_L8x16_L16_SUB | |||||
| MY_ALIGN | MY_ALIGN | ||||
| LSGEMM_L8x16_SUB2_8: | LSGEMM_L8x16_SUB2_8: | ||||
| andi. T10,L, 8 | andi. T10,L, 8 | ||||
| @@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1: | |||||
| andi. T10,L, 1 | andi. T10,L, 1 | ||||
| ble LSGEMM_L8x16_SAVE | ble LSGEMM_L8x16_SAVE | ||||
| KERNEL8x16 0 | KERNEL8x16 0 | ||||
| # addic. L, L, -1 | |||||
| # bgt LSGEMM_L8x16_SUB2 | |||||
| MY_ALIGN | MY_ALIGN | ||||
| LSGEMM_L8x16_SAVE: | LSGEMM_L8x16_SAVE: | ||||
| @@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define LOAD ld | #define LOAD ld | ||||
| #define STACKSIZE 32192 | |||||
| #define STACKSIZE 512 | |||||
| #define FZERO 312+192(SP) | #define FZERO 312+192(SP) | ||||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FRAMEPOINTER r12 | #define FRAMEPOINTER r12 | ||||
| #define BBUFFER r14 | |||||
| #define T10 r14 | |||||
| #define L r15 | #define L r15 | ||||
| #define ALPHA r16 | |||||
| #define T8 r16 | |||||
| #define T5 r17 | #define T5 r17 | ||||
| #define T2 r19 | #define T2 r19 | ||||
| #define BBO r20 | |||||
| #define o8 r21 | |||||
| #define T9 r20 | |||||
| #define T6 r21 | |||||
| #define I r22 | #define I r22 | ||||
| #define J r23 | #define J r23 | ||||
| #define AO r24 | #define AO r24 | ||||
| #define BO r25 | #define BO r25 | ||||
| #define CO r26 | #define CO r26 | ||||
| #define o16 r27 | |||||
| #define T7 r27 | |||||
| #define T3 r28 | #define T3 r28 | ||||
| #define T4 r29 | #define T4 r29 | ||||
| @@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROFCODE | PROFCODE | ||||
| mr FRAMEPOINTER, SP | mr FRAMEPOINTER, SP | ||||
| addi SP, SP, -STACKSIZE | |||||
| addi SP, SP, -STACKSIZE | |||||
| addi SP, SP, -STACKSIZE | |||||
| addi SP, SP, -STACKSIZE | |||||
| li r0, 0 | |||||
| addi SP, SP, -STACKSIZE | |||||
| mflr r0 | |||||
| stfd f14, 0(SP) | stfd f14, 0(SP) | ||||
| stfd f15, 8(SP) | stfd f15, 8(SP) | ||||
| stfd f16, 16(SP) | stfd f16, 16(SP) | ||||
| @@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stfd f30, 128(SP) | stfd f30, 128(SP) | ||||
| stfd f31, 136(SP) | stfd f31, 136(SP) | ||||
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||||
| std r31, 144(SP) | std r31, 144(SP) | ||||
| std r30, 152(SP) | std r30, 152(SP) | ||||
| @@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| std r14, 280(SP) | std r14, 280(SP) | ||||
| stxv v20, 288(SP) | |||||
| stxv v21, 304(SP) | |||||
| stxv v22, 320(SP) | |||||
| stxv v23, 336(SP) | |||||
| stxv v24, 352(SP) | |||||
| stxv v25, 368(SP) | |||||
| stxv v26, 384(SP) | |||||
| stxv v27, 400(SP) | |||||
| stxv v28, 416(SP) | |||||
| stxv v29, 432(SP) | |||||
| stxv v30, 448(SP) | |||||
| stxv v31, 464(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| std r0, FLINK_SAVE(SP) | |||||
| stw r0, FZERO | |||||
| #ifdef linux | #ifdef linux | ||||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | ||||
| @@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "zgemm_macros_power9.S" | #include "zgemm_macros_power9.S" | ||||
| cmpwi cr0, M, 0 | |||||
| ble L999 | |||||
| cmpwi cr0, N, 0 | |||||
| ble L999 | |||||
| cmpwi cr0, K, 0 | |||||
| ble L999 | |||||
| slwi LDC, LDC, ZBASE_SHIFT | slwi LDC, LDC, ZBASE_SHIFT | ||||
| li PRE, 512 | |||||
| li o8 , 8 | |||||
| li o16 , 16 | |||||
| addi BBUFFER, SP, 512+4096 | |||||
| li T1, -4096 | |||||
| and BBUFFER, BBUFFER, T1 | |||||
| addi ALPHA, SP, 296+192 | |||||
| li PRE, 512 | |||||
| li r0, 0 | |||||
| xxlor alpha_r,vs1,vs1 /*copy from register f1 */ | |||||
| xxlor alpha_i,vs2,vs2 /*copy from register f2 */ | |||||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||||
| /*negate for this case as we will use addition -1*(a+b) */ | |||||
| xvnegdp alpha_r,alpha_r | |||||
| xvnegdp alpha_i,alpha_i | |||||
| #endif | |||||
| .align 4 | .align 4 | ||||
| #include "zgemm_logic_power9.S" | #include "zgemm_logic_power9.S" | ||||
| L999: | L999: | ||||
| addi r3, 0, 0 | |||||
| lfd f14, 0(SP) | lfd f14, 0(SP) | ||||
| lfd f15, 8(SP) | lfd f15, 8(SP) | ||||
| lfd f16, 16(SP) | lfd f16, 16(SP) | ||||
| @@ -233,24 +221,24 @@ L999: | |||||
| ld r16, 264(SP) | ld r16, 264(SP) | ||||
| ld r15, 272(SP) | ld r15, 272(SP) | ||||
| ld r14, 280(SP) | ld r14, 280(SP) | ||||
| ld r0, FLINK_SAVE(SP) | |||||
| lxv v20, 288(SP) | |||||
| lxv v21, 304(SP) | |||||
| lxv v22, 320(SP) | |||||
| lxv v23, 336(SP) | |||||
| lxv v24, 352(SP) | |||||
| lxv v25, 368(SP) | |||||
| lxv v26, 384(SP) | |||||
| lxv v27, 400(SP) | |||||
| lxv v28, 416(SP) | |||||
| lxv v29, 432(SP) | |||||
| lxv v30, 448(SP) | |||||
| lxv v31, 464(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| addi SP, SP, STACKSIZE | |||||
| addi SP, SP, STACKSIZE | |||||
| addi SP, SP, STACKSIZE | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| mtlr r0 | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | blr | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | #define ZGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define SGEMM_DEFAULT_P 640 | |||||
| #define SGEMM_DEFAULT_P 832 | |||||
| #define DGEMM_DEFAULT_P 128 | #define DGEMM_DEFAULT_P 128 | ||||
| #define CGEMM_DEFAULT_P 640 | #define CGEMM_DEFAULT_P 640 | ||||
| #define ZGEMM_DEFAULT_P 512 | |||||
| #define ZGEMM_DEFAULT_P 256 | |||||
| #define SGEMM_DEFAULT_Q 1408 | |||||
| #define SGEMM_DEFAULT_Q 1025 | |||||
| #define DGEMM_DEFAULT_Q 384 | #define DGEMM_DEFAULT_Q 384 | ||||
| #define CGEMM_DEFAULT_Q 640 | #define CGEMM_DEFAULT_Q 640 | ||||
| #define ZGEMM_DEFAULT_Q 1152 | |||||
| #define ZGEMM_DEFAULT_Q 1025 | |||||
| #define SYMV_P 8 | #define SYMV_P 8 | ||||