optimized dgemm on power8 for 20 threads (tags/v0.2.19^2)
@@ -13,10 +13,10 @@ endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
endif
endif
@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 32 << 20)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
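
Note: the shift expressions above are byte counts, so this hunk doubles the POWER8 scratch buffer from 32 MiB to 64 MiB. A minimal standalone C check of that arithmetic (not part of the patch, included only to make the sizes explicit):

#include <stdio.h>

int main(void) {
    /* (N << 20) bytes is N MiB; the hunk above doubles the POWER8 buffer. */
    printf("old: %d MiB\n", (32 << 20) >> 20);   /* prints 32 */
    printf("new: %d MiB\n", (64 << 20) >> 20);   /* prints 64 */
    return 0;
}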
@@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDGEMM_L4_BEGIN:
mr CO, C
li T1, 128
li T2, 256
mr AO, A
slwi T1, LDC , 2
add C, C, T1
mr CO, C
slwi T3, LDC , 2
add C, C, T3
dcbt A, T1
dcbt A, T2
srawi. I, M, 4
ble LDGEMM_L4x16_END
.align 4
LDGEMM_L4x16_BEGIN_FIRST:
li L, -128
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
and T1, T1, L
and T2, T2, L
and T3, T3, L
and T4, T4, L
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
mr BO, B
srawi. L, K, 2
addi T1, T1, 128
addi T2, T2, 128
addi T3, T3, 128
addi T4, T4, 128
dcbt T1, r0
dcbt T2, r0
dcbt T3, r0
dcbt T4, r0
ble LDGEMM_L4x16_SUB0_FIRST
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4_FIRST
.align 4
LDGEMM_L4x16_LOOP_START_FIRST:
li T2, 512
li o40, 40
li o56, 56
dcbt AO, PRE
dcbt BO, T2
LOAD4x16_1
dcbt AO, PRE
KERNEL4x16_I1
dcbt AO, PRE
addic. L, L, -2
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END_FIRST
mtctr L
.align 4
LDGEMM_L4x16_LOOP_FIRST:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
KERNEL4x16_L2
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
dcbt BO, T2
KERNEL4x16_L2
bdnz LDGEMM_L4x16_LOOP_FIRST
.align 4
LDGEMM_L4x16_LOOP_END_FIRST:
KERNEL4x16_L1
KERNEL4x16_L2
KERNEL4x16_1
KERNEL4x16_E2
b LDGEMM_L4x16_SUB1_FIRST
LDGEMM_L4x16_SUB4_FIRST:
KERNEL4x16_SUBI1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
KERNEL4x16_SUB1
b LDGEMM_L4x16_SUB1_FIRST
LDGEMM_L4x16_SUB0_FIRST:
andi. L, K, 3
KERNEL4x16_SUBI1
addic. L, L, -1
ble LDGEMM_L4x16_SAVE_FIRST
b LDGEMM_L4x16_SUB2_FIRST
LDGEMM_L4x16_SUB1_FIRST:
andi. L, K, 3
ble LDGEMM_L4x16_SAVE_FIRST
LDGEMM_L4x16_SUB2_FIRST:
KERNEL4x16_SUB1
addic. L, L, -1
bgt LDGEMM_L4x16_SUB2_FIRST
.align 4
LDGEMM_L4x16_SAVE_FIRST:
SAVE4x16
addic. I, I, -1
ble LDGEMM_L4x16_END
LDGEMM_L4x16_END_FIRST:
.align 4
LDGEMM_L4x16_BEGIN:
@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
dcbt T3, r0
dcbt T4, r0
ble LDGEMM_L4x16_SUB0
ble- LDGEMM_L4x16_SUB0
cmpwi cr0, L, 1
ble LDGEMM_L4x16_SUB4
ble- LDGEMM_L4x16_SUB4
.align 4
LDGEMM_L4x16_LOOP_START:
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
addic. L, L, -2
KERNEL4x16_L2
ble LDGEMM_L4x16_LOOP_END
ble- LDGEMM_L4x16_LOOP_END
mtctr L
.align 4
@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
dcbt AO, PRE
KERNEL4x16_L1
dcbt AO, PRE
addic. L, L, -1
// addic. L, L, -1
KERNEL4x16_L2
bgt LDGEMM_L4x16_LOOP
bdnz+ LDGEMM_L4x16_LOOP
.align 4
@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
SAVE4x16
addic. I, I, -1
bgt LDGEMM_L4x16_BEGIN
bgt+ LDGEMM_L4x16_BEGIN
LDGEMM_L4x16_END:
@@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x16
mr T1, CO
add T2, T1, LDC
add T3, T2, LDC
add T4, T3, LDC
add T2, CO, LDC
lxvd2x vs0, 0, CO
lxvd2x vs1, o16, CO
@@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs3, o48, CO
lxvd2x vs4, o64, CO
lxvd2x vs5, o80, CO
add T3, T2, LDC
lxvd2x vs6, o96, CO
lxvd2x vs7, o112, CO
@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, T2
lxvd2x vs12, o64, T2
lxvd2x vs13, o80, T2
add T4, T3, LDC
lxvd2x vs14, o96, T2
lxvd2x vs15, o112, T2
@@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs31, o112, T3
xvmaddadp vs0, vs32, alpha_r
xvmaddadp vs1, vs33, alpha_r
xvmaddadp vs2, vs34, alpha_r
xvmaddadp vs3, vs35, alpha_r
xvmaddadp vs4, vs36, alpha_r
xvmaddadp vs5, vs37, alpha_r
xvmaddadp vs6, vs38, alpha_r
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs32, 0, T4
xvmaddadp vs1, vs33, alpha_r
lxvd2x vs33, o16, T4
xvmaddadp vs2, vs34, alpha_r
lxvd2x vs34, o32, T4
xvmaddadp vs3, vs35, alpha_r
lxvd2x vs35, o48, T4
xvmaddadp vs4, vs36, alpha_r
lxvd2x vs36, o64, T4
xvmaddadp vs5, vs37, alpha_r
lxvd2x vs37, o80, T4
xvmaddadp vs6, vs38, alpha_r
lxvd2x vs38, o96, T4
xvmaddadp vs7, vs39, alpha_r
lxvd2x vs39, o112, T4
xvmaddadp vs8, vs40, alpha_r
@@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvmaddadp vs10, vs42, alpha_r
xvmaddadp vs11, vs43, alpha_r
stxvd2x vs0, 0, T1
stxvd2x vs1, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
xvmaddadp vs12, vs44, alpha_r
xvmaddadp vs13, vs45, alpha_r
xvmaddadp vs14, vs46, alpha_r
xvmaddadp vs15, vs47, alpha_r
stxvd2x vs4, o64, T1
stxvd2x vs5, o80, T1
stxvd2x vs6, o96, T1
stxvd2x vs7, o112, T1
xvmaddadp vs24, vs48, alpha_r
xvmaddadp vs25, vs49, alpha_r
xvmaddadp vs26, vs50, alpha_r
xvmaddadp vs27, vs51, alpha_r
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
xvmaddadp vs28, vs52, alpha_r
xvmaddadp vs29, vs53, alpha_r
xvmaddadp vs30, vs54, alpha_r
xvmaddadp vs31, vs55, alpha_r
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs0, 0, CO
stxvd2x vs1, o16, CO
stxvd2x vs2, o32, CO
stxvd2x vs3, o48, CO
stxvd2x vs4, o64, CO
stxvd2x vs5, o80, CO
stxvd2x vs6, o96, CO
stxvd2x vs7, o112, CO
xvmaddadp vs32, vs56, alpha_r
xvmaddadp vs33, vs57, alpha_r
xvmaddadp vs34, vs58, alpha_r
xvmaddadp vs35, vs59, alpha_r
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
xvmaddadp vs36, vs60, alpha_r
xvmaddadp vs37, vs61, alpha_r
xvmaddadp vs38, vs62, alpha_r
xvmaddadp vs39, vs63, alpha_r
addi CO, CO, 128
stxvd2x vs8, o0, T2
stxvd2x vs9, o16, T2
stxvd2x vs10, o32, T2
stxvd2x vs11, o48, T2
stxvd2x vs12, o64, T2
stxvd2x vs13, o80, T2
stxvd2x vs14, o96, T2
stxvd2x vs15, o112, T2
stxvd2x vs24, 0, T3
stxvd2x vs25, o16, T3
stxvd2x vs28, o64, T3
stxvd2x vs29, o80, T3
stxvd2x vs26, o32, T3
stxvd2x vs27, o48, T3
stxvd2x vs30, o96, T3
stxvd2x vs31, o112, T3
@@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T4
stxvd2x vs35, o48, T4
addi CO, CO, 128
stxvd2x vs36, o64, T4
stxvd2x vs37, o80, T4
stxvd2x vs38, o96, T4
@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 4096
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
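
Note: *GEMM_DEFAULT_P/Q are the cache-blocking sizes used when packing GEMM panels, and GEMM_DEFAULT_OFFSET_B/GEMM_DEFAULT_ALIGN control where the packed B panel is placed relative to the packed A panel inside the shared buffer. The snippet below is only a hedged sketch of how a 0x0ffffUL mask plus a 65536-byte offset could be combined for that placement; place_b_panel is a hypothetical helper written for illustration, not OpenBLAS's actual buffer code.

#include <stdint.h>
#include <stdio.h>

/* Constants taken from the hunk above. */
#define OFFSET_B   65536UL     /* new GEMM_DEFAULT_OFFSET_B */
#define ALIGN_MASK 0x0ffffUL   /* new GEMM_DEFAULT_ALIGN: 64 KiB - 1 */

/* Hypothetical illustration: round the end of the packed A panel up to the
 * next 64 KiB boundary, then skip OFFSET_B more bytes so the A and B panels
 * start at distinct 64 KiB-aligned addresses. */
static uintptr_t place_b_panel(uintptr_t a_panel_end)
{
    uintptr_t aligned = (a_panel_end + ALIGN_MASK) & ~ALIGN_MASK;
    return aligned + OFFSET_B;
}

int main(void)
{
    /* Example: an A panel ending at 0x10000123 puts B at 0x10020000. */
    printf("B panel would start at %#lx\n",
           (unsigned long)place_b_panel(0x10000123UL));
    return 0;
}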