optimized dgemm on power8 for 20 threads (tags/v0.2.19^2)
| @@ -13,10 +13,10 @@ endif | |||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | ||||
| else | else | ||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math | |||||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | ||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -803,7 +803,7 @@ Lmcount$lazy_ptr: | |||||
| #elif defined(PPC440FP2) | #elif defined(PPC440FP2) | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #elif defined(POWER8) | #elif defined(POWER8) | ||||
| #define BUFFER_SIZE ( 32 << 20) | |||||
| #define BUFFER_SIZE ( 64 << 20) | |||||
| #else | #else | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #endif | #endif | ||||
| @@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| LDGEMM_L4_BEGIN: | LDGEMM_L4_BEGIN: | ||||
| mr CO, C | |||||
| li T1, 128 | |||||
| li T2, 256 | |||||
| mr AO, A | mr AO, A | ||||
| slwi T1, LDC , 2 | |||||
| add C, C, T1 | |||||
| mr CO, C | |||||
| slwi T3, LDC , 2 | |||||
| add C, C, T3 | |||||
| dcbt A, T1 | |||||
| dcbt A, T2 | |||||
| srawi. I, M, 4 | srawi. I, M, 4 | ||||
| ble LDGEMM_L4x16_END | ble LDGEMM_L4x16_END | ||||
| .align 4 | |||||
| LDGEMM_L4x16_BEGIN_FIRST: | |||||
| li L, -128 | |||||
| mr T1, CO | |||||
| add T2, T1, LDC | |||||
| add T3, T2, LDC | |||||
| add T4, T3, LDC | |||||
| and T1, T1, L | |||||
| and T2, T2, L | |||||
| and T3, T3, L | |||||
| and T4, T4, L | |||||
| dcbt T1, r0 | |||||
| dcbt T2, r0 | |||||
| dcbt T3, r0 | |||||
| dcbt T4, r0 | |||||
| mr BO, B | |||||
| srawi. L, K, 2 | |||||
| addi T1, T1, 128 | |||||
| addi T2, T2, 128 | |||||
| addi T3, T3, 128 | |||||
| addi T4, T4, 128 | |||||
| dcbt T1, r0 | |||||
| dcbt T2, r0 | |||||
| dcbt T3, r0 | |||||
| dcbt T4, r0 | |||||
| ble LDGEMM_L4x16_SUB0_FIRST | |||||
| cmpwi cr0, L, 1 | |||||
| ble LDGEMM_L4x16_SUB4_FIRST | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP_START_FIRST: | |||||
| li T2, 512 | |||||
| li o40, 40 | |||||
| li o56, 56 | |||||
| dcbt AO, PRE | |||||
| dcbt BO, T2 | |||||
| LOAD4x16_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_I1 | |||||
| dcbt AO, PRE | |||||
| addic. L, L, -2 | |||||
| KERNEL4x16_L2 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_L1 | |||||
| dcbt AO, PRE | |||||
| dcbt BO, T2 | |||||
| KERNEL4x16_L2 | |||||
| ble LDGEMM_L4x16_LOOP_END_FIRST | |||||
| mtctr L | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP_FIRST: | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_L1 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_L2 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_L1 | |||||
| dcbt AO, PRE | |||||
| dcbt BO, T2 | |||||
| KERNEL4x16_L2 | |||||
| bdnz LDGEMM_L4x16_LOOP_FIRST | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP_END_FIRST: | |||||
| KERNEL4x16_L1 | |||||
| KERNEL4x16_L2 | |||||
| KERNEL4x16_1 | |||||
| KERNEL4x16_E2 | |||||
| b LDGEMM_L4x16_SUB1_FIRST | |||||
| LDGEMM_L4x16_SUB4_FIRST: | |||||
| KERNEL4x16_SUBI1 | |||||
| KERNEL4x16_SUB1 | |||||
| KERNEL4x16_SUB1 | |||||
| KERNEL4x16_SUB1 | |||||
| b LDGEMM_L4x16_SUB1_FIRST | |||||
| LDGEMM_L4x16_SUB0_FIRST: | |||||
| andi. L, K, 3 | |||||
| KERNEL4x16_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble LDGEMM_L4x16_SAVE_FIRST | |||||
| b LDGEMM_L4x16_SUB2_FIRST | |||||
| LDGEMM_L4x16_SUB1_FIRST: | |||||
| andi. L, K, 3 | |||||
| ble LDGEMM_L4x16_SAVE_FIRST | |||||
| LDGEMM_L4x16_SUB2_FIRST: | |||||
| KERNEL4x16_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt LDGEMM_L4x16_SUB2_FIRST | |||||
| .align 4 | |||||
| LDGEMM_L4x16_SAVE_FIRST: | |||||
| SAVE4x16 | |||||
| addic. I, I, -1 | |||||
| ble LDGEMM_L4x16_END | |||||
| LDGEMM_L4x16_END_FIRST: | |||||
| .align 4 | .align 4 | ||||
| LDGEMM_L4x16_BEGIN: | LDGEMM_L4x16_BEGIN: | ||||
| @@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN: | |||||
| dcbt T3, r0 | dcbt T3, r0 | ||||
| dcbt T4, r0 | dcbt T4, r0 | ||||
| ble LDGEMM_L4x16_SUB0 | |||||
| ble- LDGEMM_L4x16_SUB0 | |||||
| cmpwi cr0, L, 1 | cmpwi cr0, L, 1 | ||||
| ble LDGEMM_L4x16_SUB4 | |||||
| ble- LDGEMM_L4x16_SUB4 | |||||
| .align 4 | .align 4 | ||||
| LDGEMM_L4x16_LOOP_START: | LDGEMM_L4x16_LOOP_START: | ||||
| @@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START: | |||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| KERNEL4x16_L2 | KERNEL4x16_L2 | ||||
| ble LDGEMM_L4x16_LOOP_END | |||||
| ble- LDGEMM_L4x16_LOOP_END | |||||
| mtctr L | |||||
| .align 4 | .align 4 | ||||
| @@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP: | |||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| KERNEL4x16_L1 | KERNEL4x16_L1 | ||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| addic. L, L, -1 | |||||
| // addic. L, L, -1 | |||||
| KERNEL4x16_L2 | KERNEL4x16_L2 | ||||
| bgt LDGEMM_L4x16_LOOP | |||||
| bdnz+ LDGEMM_L4x16_LOOP | |||||
| .align 4 | .align 4 | ||||
| @@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE: | |||||
| SAVE4x16 | SAVE4x16 | ||||
| addic. I, I, -1 | addic. I, I, -1 | ||||
| bgt LDGEMM_L4x16_BEGIN | |||||
| bgt+ LDGEMM_L4x16_BEGIN | |||||
| LDGEMM_L4x16_END: | LDGEMM_L4x16_END: | ||||
| @@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x16 | .macro SAVE4x16 | ||||
| mr T1, CO | |||||
| add T2, T1, LDC | |||||
| add T3, T2, LDC | |||||
| add T4, T3, LDC | |||||
| add T2, CO, LDC | |||||
| lxvd2x vs0, 0, CO | lxvd2x vs0, 0, CO | ||||
| lxvd2x vs1, o16, CO | lxvd2x vs1, o16, CO | ||||
| @@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lxvd2x vs3, o48, CO | lxvd2x vs3, o48, CO | ||||
| lxvd2x vs4, o64, CO | lxvd2x vs4, o64, CO | ||||
| lxvd2x vs5, o80, CO | lxvd2x vs5, o80, CO | ||||
| add T3, T2, LDC | |||||
| lxvd2x vs6, o96, CO | lxvd2x vs6, o96, CO | ||||
| lxvd2x vs7, o112, CO | lxvd2x vs7, o112, CO | ||||
| @@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lxvd2x vs11, o48, T2 | lxvd2x vs11, o48, T2 | ||||
| lxvd2x vs12, o64, T2 | lxvd2x vs12, o64, T2 | ||||
| lxvd2x vs13, o80, T2 | lxvd2x vs13, o80, T2 | ||||
| add T4, T3, LDC | |||||
| lxvd2x vs14, o96, T2 | lxvd2x vs14, o96, T2 | ||||
| lxvd2x vs15, o112, T2 | lxvd2x vs15, o112, T2 | ||||
| @@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lxvd2x vs31, o112, T3 | lxvd2x vs31, o112, T3 | ||||
| xvmaddadp vs0, vs32, alpha_r | xvmaddadp vs0, vs32, alpha_r | ||||
| xvmaddadp vs1, vs33, alpha_r | |||||
| xvmaddadp vs2, vs34, alpha_r | |||||
| xvmaddadp vs3, vs35, alpha_r | |||||
| xvmaddadp vs4, vs36, alpha_r | |||||
| xvmaddadp vs5, vs37, alpha_r | |||||
| xvmaddadp vs6, vs38, alpha_r | |||||
| xvmaddadp vs7, vs39, alpha_r | |||||
| lxvd2x vs32, 0, T4 | lxvd2x vs32, 0, T4 | ||||
| xvmaddadp vs1, vs33, alpha_r | |||||
| lxvd2x vs33, o16, T4 | lxvd2x vs33, o16, T4 | ||||
| xvmaddadp vs2, vs34, alpha_r | |||||
| lxvd2x vs34, o32, T4 | lxvd2x vs34, o32, T4 | ||||
| xvmaddadp vs3, vs35, alpha_r | |||||
| lxvd2x vs35, o48, T4 | lxvd2x vs35, o48, T4 | ||||
| xvmaddadp vs4, vs36, alpha_r | |||||
| lxvd2x vs36, o64, T4 | lxvd2x vs36, o64, T4 | ||||
| xvmaddadp vs5, vs37, alpha_r | |||||
| lxvd2x vs37, o80, T4 | lxvd2x vs37, o80, T4 | ||||
| xvmaddadp vs6, vs38, alpha_r | |||||
| lxvd2x vs38, o96, T4 | lxvd2x vs38, o96, T4 | ||||
| xvmaddadp vs7, vs39, alpha_r | |||||
| lxvd2x vs39, o112, T4 | lxvd2x vs39, o112, T4 | ||||
| xvmaddadp vs8, vs40, alpha_r | xvmaddadp vs8, vs40, alpha_r | ||||
| @@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs10, vs42, alpha_r | xvmaddadp vs10, vs42, alpha_r | ||||
| xvmaddadp vs11, vs43, alpha_r | xvmaddadp vs11, vs43, alpha_r | ||||
| stxvd2x vs0, 0, T1 | |||||
| stxvd2x vs1, o16, T1 | |||||
| stxvd2x vs2, o32, T1 | |||||
| stxvd2x vs3, o48, T1 | |||||
| xvmaddadp vs12, vs44, alpha_r | xvmaddadp vs12, vs44, alpha_r | ||||
| xvmaddadp vs13, vs45, alpha_r | xvmaddadp vs13, vs45, alpha_r | ||||
| xvmaddadp vs14, vs46, alpha_r | xvmaddadp vs14, vs46, alpha_r | ||||
| xvmaddadp vs15, vs47, alpha_r | xvmaddadp vs15, vs47, alpha_r | ||||
| stxvd2x vs4, o64, T1 | |||||
| stxvd2x vs5, o80, T1 | |||||
| stxvd2x vs6, o96, T1 | |||||
| stxvd2x vs7, o112, T1 | |||||
| xvmaddadp vs24, vs48, alpha_r | xvmaddadp vs24, vs48, alpha_r | ||||
| xvmaddadp vs25, vs49, alpha_r | xvmaddadp vs25, vs49, alpha_r | ||||
| xvmaddadp vs26, vs50, alpha_r | xvmaddadp vs26, vs50, alpha_r | ||||
| xvmaddadp vs27, vs51, alpha_r | xvmaddadp vs27, vs51, alpha_r | ||||
| stxvd2x vs8, o0, T2 | |||||
| stxvd2x vs9, o16, T2 | |||||
| stxvd2x vs10, o32, T2 | |||||
| stxvd2x vs11, o48, T2 | |||||
| xvmaddadp vs28, vs52, alpha_r | xvmaddadp vs28, vs52, alpha_r | ||||
| xvmaddadp vs29, vs53, alpha_r | xvmaddadp vs29, vs53, alpha_r | ||||
| xvmaddadp vs30, vs54, alpha_r | xvmaddadp vs30, vs54, alpha_r | ||||
| xvmaddadp vs31, vs55, alpha_r | xvmaddadp vs31, vs55, alpha_r | ||||
| stxvd2x vs12, o64, T2 | |||||
| stxvd2x vs13, o80, T2 | |||||
| stxvd2x vs14, o96, T2 | |||||
| stxvd2x vs15, o112, T2 | |||||
| stxvd2x vs0, 0, CO | |||||
| stxvd2x vs1, o16, CO | |||||
| stxvd2x vs2, o32, CO | |||||
| stxvd2x vs3, o48, CO | |||||
| stxvd2x vs4, o64, CO | |||||
| stxvd2x vs5, o80, CO | |||||
| stxvd2x vs6, o96, CO | |||||
| stxvd2x vs7, o112, CO | |||||
| xvmaddadp vs32, vs56, alpha_r | xvmaddadp vs32, vs56, alpha_r | ||||
| xvmaddadp vs33, vs57, alpha_r | xvmaddadp vs33, vs57, alpha_r | ||||
| xvmaddadp vs34, vs58, alpha_r | xvmaddadp vs34, vs58, alpha_r | ||||
| xvmaddadp vs35, vs59, alpha_r | xvmaddadp vs35, vs59, alpha_r | ||||
| stxvd2x vs24, 0, T3 | |||||
| stxvd2x vs25, o16, T3 | |||||
| stxvd2x vs26, o32, T3 | |||||
| stxvd2x vs27, o48, T3 | |||||
| xvmaddadp vs36, vs60, alpha_r | xvmaddadp vs36, vs60, alpha_r | ||||
| xvmaddadp vs37, vs61, alpha_r | xvmaddadp vs37, vs61, alpha_r | ||||
| xvmaddadp vs38, vs62, alpha_r | xvmaddadp vs38, vs62, alpha_r | ||||
| xvmaddadp vs39, vs63, alpha_r | xvmaddadp vs39, vs63, alpha_r | ||||
| addi CO, CO, 128 | |||||
| stxvd2x vs8, o0, T2 | |||||
| stxvd2x vs9, o16, T2 | |||||
| stxvd2x vs10, o32, T2 | |||||
| stxvd2x vs11, o48, T2 | |||||
| stxvd2x vs12, o64, T2 | |||||
| stxvd2x vs13, o80, T2 | |||||
| stxvd2x vs14, o96, T2 | |||||
| stxvd2x vs15, o112, T2 | |||||
| stxvd2x vs24, 0, T3 | |||||
| stxvd2x vs25, o16, T3 | |||||
| stxvd2x vs28, o64, T3 | stxvd2x vs28, o64, T3 | ||||
| stxvd2x vs29, o80, T3 | stxvd2x vs29, o80, T3 | ||||
| stxvd2x vs26, o32, T3 | |||||
| stxvd2x vs27, o48, T3 | |||||
| stxvd2x vs30, o96, T3 | stxvd2x vs30, o96, T3 | ||||
| stxvd2x vs31, o112, T3 | stxvd2x vs31, o112, T3 | ||||
| @@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stxvd2x vs34, o32, T4 | stxvd2x vs34, o32, T4 | ||||
| stxvd2x vs35, o48, T4 | stxvd2x vs35, o48, T4 | ||||
| addi CO, CO, 128 | |||||
| stxvd2x vs36, o64, T4 | stxvd2x vs36, o64, T4 | ||||
| stxvd2x vs37, o80, T4 | stxvd2x vs37, o80, T4 | ||||
| stxvd2x vs38, o96, T4 | stxvd2x vs38, o96, T4 | ||||
| @@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define DNUMOPT 8 | #define DNUMOPT 8 | ||||
| #define GEMM_DEFAULT_OFFSET_A 0 | #define GEMM_DEFAULT_OFFSET_A 0 | ||||
| #define GEMM_DEFAULT_OFFSET_B 4096 | |||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| @@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_P 320 | #define ZGEMM_DEFAULT_P 320 | ||||
| #define SGEMM_DEFAULT_Q 640 | #define SGEMM_DEFAULT_Q 640 | ||||
| #define DGEMM_DEFAULT_Q 640 | |||||
| #define DGEMM_DEFAULT_Q 720 | |||||
| #define CGEMM_DEFAULT_Q 640 | #define CGEMM_DEFAULT_Q 640 | ||||
| #define ZGEMM_DEFAULT_Q 640 | #define ZGEMM_DEFAULT_Q 640 | ||||