optimized dgemm and dgetrf for POWER8 (tags/v0.2.19^2)
| @@ -332,6 +332,13 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifdef POWER8 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| #ifndef YIELDING | |||
| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define MY_ALIGN .align 3 | |||
| srawi. J, N, 2 | |||
| ble LDGEMM_L4_END | |||
| @@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN: | |||
| srawi. I, M, 4 | |||
| ble LDGEMM_L4x16_END | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_BEGIN_FIRST: | |||
| li L, -128 | |||
| @@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST: | |||
| cmpwi cr0, L, 1 | |||
| ble LDGEMM_L4x16_SUB4_FIRST | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP_START_FIRST: | |||
| li T2, 512 | |||
| @@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST: | |||
| ble LDGEMM_L4x16_LOOP_END_FIRST | |||
| mtctr L | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP_FIRST: | |||
| @@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST: | |||
| bdnz LDGEMM_L4x16_LOOP_FIRST | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP_END_FIRST: | |||
| @@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST: | |||
| addic. L, L, -1 | |||
| bgt LDGEMM_L4x16_SUB2_FIRST | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SAVE_FIRST: | |||
| SAVE4x16 | |||
| @@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST: | |||
| LDGEMM_L4x16_END_FIRST: | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_BEGIN: | |||
| li L, -128 | |||
| @@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN: | |||
| cmpwi cr0, L, 1 | |||
| ble- LDGEMM_L4x16_SUB4 | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP_START: | |||
| li o40, 40 | |||
| @@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START: | |||
| ble- LDGEMM_L4x16_LOOP_END | |||
| mtctr L | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP: | |||
| dcbt AO, PRE | |||
| KERNEL4x16_L1 | |||
| dcbt AO, PRE | |||
| // addic. L, L, -1 | |||
| KERNEL4x16_L2 | |||
| bdnz+ LDGEMM_L4x16_LOOP | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_LOOP_END: | |||
| @@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END: | |||
| b LDGEMM_L4x16_SUB1 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SUB4: | |||
| KERNEL4x16_SUBI1 | |||
| @@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4: | |||
| b LDGEMM_L4x16_SUB1 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SUB0: | |||
| andi. L, K, 1 | |||
| @@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0: | |||
| ble LDGEMM_L4x16_SAVE | |||
| b LDGEMM_L4x16_SUB2 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SUB1: | |||
| andi. L, K, 1 | |||
| ble LDGEMM_L4x16_SAVE | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SUB2: | |||
| KERNEL4x16_SUB1 | |||
| @@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2: | |||
| addic. L, L, -1 | |||
| bgt LDGEMM_L4x16_SUB2 | |||
| .align 4 | |||
| MY_ALIGN | |||
| LDGEMM_L4x16_SAVE: | |||
| SAVE4x16 | |||
| @@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L4x8_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L4x8_LOOP: | |||
| @@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L4x4_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L4x4_LOOP: | |||
| @@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L4x2_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L4x2_LOOP: | |||
| @@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L4x1_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L4x1_LOOP: | |||
| @@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L2x16_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L2x16_LOOP: | |||
| @@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L2x8_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L2x8_LOOP: | |||
| @@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L2x4_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L2x4_LOOP: | |||
| @@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L2x2_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L2x2_LOOP: | |||
| @@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L2x1_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L2x1_LOOP: | |||
| @@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L1x16_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L1x16_LOOP: | |||
| @@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L1x8_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L1x8_LOOP: | |||
| @@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L1x4_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L1x4_LOOP: | |||
| @@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L1x2_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L1x2_LOOP: | |||
| @@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START: | |||
| addic. L, L, -2 | |||
| ble LDGEMM_L1x1_LOOP_END | |||
| .align 5 | |||
| MY_ALIGN | |||
| LDGEMM_L1x1_LOOP: | |||
| @@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs62, vs7, vs15, 3 | |||
| xxpermdi vs63, vs23, vs31, 3 | |||
| dcbt BO, PREB | |||
| stxvd2x vs32, o0, BO | |||
| stxvd2x vs33, o16, BO | |||
| @@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvd2x vs39, o112, BO | |||
| addi BO, BO, 128 | |||
| dcbt BO, PREB | |||
| stxvd2x vs40, o0, BO | |||
| stxvd2x vs41, o16, BO | |||
| stxvd2x vs42, o32, BO | |||
| @@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvd2x vs47, o112, BO | |||
| addi BO, BO, 128 | |||
| dcbt BO, PREB | |||
| stxvd2x vs48, o0, BO | |||
| stxvd2x vs49, o16, BO | |||
| stxvd2x vs50, o32, BO | |||
| @@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvd2x vs55, o112, BO | |||
| addi BO, BO, 128 | |||
| dcbt BO, PREB | |||
| stxvd2x vs56, o0, BO | |||
| stxvd2x vs57, o16, BO | |||
| stxvd2x vs58, o32, BO | |||
| @@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add B2, B2, B | |||
| add B1, B1, B | |||
| li PREA, 256 | |||
| li PREA, 384 | |||
| addi PREB, M16, 128 | |||
| li o8, 8 | |||
| @@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN: | |||
| ble DCOPYT_L4x8_BEGIN | |||
| mr BO, B16 | |||
| addi T2, M16, 384 | |||
| mtctr J | |||
| .align 5 | |||
| DCOPYT_L4x16_LOOP: | |||
| /* | |||
| addi T1, PREB, 128 | |||
| addi T2, PREB, 256 | |||
| */ | |||
| addi T1, M16, 256 | |||
| dcbt A0, PREA | |||
| dcbt A1, PREA | |||
| dcbt A2, PREA | |||
| dcbt A3, PREA | |||
| /* | |||
| dcbtst BO, M16 | |||
| dcbtst BO, PREB | |||
| dcbtst BO, T1 | |||
| dcbtst BO, T2 | |||
| */ | |||
| dcbt BO, M16 | |||
| dcbt BO, PREB | |||
| dcbt BO, T1 | |||
| dcbt BO, T2 | |||
| COPY_4x16 | |||
| add BO, BO, M16 | |||
| addic. J, J, -1 | |||
| bgt DCOPYT_L4x16_LOOP | |||
| // addic. J, J, -1 | |||
| bdnz+ DCOPYT_L4x16_LOOP | |||
| DCOPYT_L4x8_BEGIN: | |||
| @@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvd2x vs35, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs36, o0, A0 | |||
| lxvd2x vs37, o16, A0 | |||
| lxvd2x vs38, o32, A0 | |||
| lxvd2x vs39, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs40, o0, A1 | |||
| lxvd2x vs41, o16, A1 | |||
| lxvd2x vs42, o32, A1 | |||
| lxvd2x vs43, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs44, o0, A1 | |||
| lxvd2x vs45, o16, A1 | |||
| lxvd2x vs46, o32, A1 | |||
| lxvd2x vs47, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs48, o0, A2 | |||
| lxvd2x vs49, o16, A2 | |||
| lxvd2x vs50, o32, A2 | |||
| lxvd2x vs51, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs52, o0, A2 | |||
| lxvd2x vs53, o16, A2 | |||
| lxvd2x vs54, o32, A2 | |||
| lxvd2x vs55, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs56, o0, A3 | |||
| lxvd2x vs57, o16, A3 | |||
| lxvd2x vs58, o32, A3 | |||
| lxvd2x vs59, o48, A3 | |||
| addi A3, A3, 64 | |||
| lxvd2x vs36, o0, A0 | |||
| lxvd2x vs37, o16, A0 | |||
| lxvd2x vs38, o32, A0 | |||
| lxvd2x vs39, o48, A0 | |||
| addi A0, A0, 64 | |||
| lxvd2x vs44, o0, A1 | |||
| lxvd2x vs45, o16, A1 | |||
| lxvd2x vs46, o32, A1 | |||
| lxvd2x vs47, o48, A1 | |||
| addi A1, A1, 64 | |||
| lxvd2x vs52, o0, A2 | |||
| lxvd2x vs53, o16, A2 | |||
| lxvd2x vs54, o32, A2 | |||
| lxvd2x vs55, o48, A2 | |||
| addi A2, A2, 64 | |||
| lxvd2x vs60, o0, A3 | |||
| lxvd2x vs61, o16, A3 | |||
| lxvd2x vs62, o32, A3 | |||
| lxvd2x vs63, o48, A3 | |||
| addi A3, A3, 64 | |||
| mr T1, BO | |||
| stxvd2x vs32, o0, T1 | |||
| @@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); | |||
| if (blocking > GEMM_Q) blocking = GEMM_Q; | |||
| if (blocking <= GEMM_UNROLL_N * 2) { | |||
| #ifdef POWER8 | |||
| if (blocking <= GEMM_UNROLL_N) { | |||
| info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
| return info; | |||
| } | |||
| #else | |||
| if (blocking <= GEMM_UNROLL_N*2) { | |||
| info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
| return info; | |||
| } | |||
| #endif | |||
| sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
| @@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); | |||
| if (blocking > GEMM_Q) blocking = GEMM_Q; | |||
| #ifdef POWER8 | |||
| if (blocking <= GEMM_UNROLL_N) { | |||
| info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
| return info; | |||
| } | |||
| #else | |||
| if (blocking <= GEMM_UNROLL_N * 2) { | |||
| info = GETF2(args, NULL, range_n, sa, sb, 0); | |||
| return info; | |||
| } | |||
| #endif | |||
| sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||