From 8b140220c8dd4ac0b93204951486e1ef6d898efa Mon Sep 17 00:00:00 2001
From: Werner Saar
Date: Sun, 22 May 2016 15:20:04 +0200
Subject: [PATCH] optimized dtrsm_kernel_LT for POWER8

---
 kernel/power/dtrsm_kernel_LT_16x4_power8.S | 1 +
 kernel/power/dtrsm_logic_LT_16x4_power8.S | 46 +++++++++++++++++++++-
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
index e1c6249f8..fdfc5ac70 100644
--- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S
+++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S
@@ -219,6 +219,7 @@
 	li	o24, 24
 	li	o32, 32
 	li	o48, 48
+	li	PRE, 384	/* PRE = 384: index operand of the "dcbt AO, PRE" / "dcbt BO, PRE" hints in dtrsm_logic_LT_16x4_power8.S, i.e. touch 384 bytes past AO/BO */
 	mr	KK, OFFSET
diff --git a/kernel/power/dtrsm_logic_LT_16x4_power8.S b/kernel/power/dtrsm_logic_LT_16x4_power8.S
index d5d34b422..540a64062 100644
--- a/kernel/power/dtrsm_logic_LT_16x4_power8.S
+++ b/kernel/power/dtrsm_logic_LT_16x4_power8.S
@@ -18,6 +18,33 @@ DSTRM_LT_L4x16_BEGIN:
 	mr	BO, B
+	li	L, -128	/* L is borrowed as an AND mask: -128 clears the low 7 bits; L is reloaded from KK before the loop */
+
+	mr	T1, CO	/* T1..T4 = CO, CO+LDC, CO+2*LDC, CO+3*LDC */
+	add	T2, T1, LDC
+	add	T3, T2, LDC
+	add	T4, T3, LDC
+
+	and	T1, T1, L	/* round each address down to a multiple of 128 bytes (the POWER8 cache-block size) */
+	and	T2, T2, L
+	and	T3, T3, L
+	and	T4, T4, L
+
+	dcbt	T1, r0	/* Data Cache Block Touch (prefetch hint) at EA = T1 + r0. NOTE(review): relies on r0 holding 0, which this patch does not show - confirm */
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+
+	addi	T1, T1, 128	/* step each address to the following 128-byte block */
+	addi	T2, T2, 128
+	addi	T3, T3, 128
+	addi	T4, T4, 128
+
+	dcbt	T1, r0	/* touch the second block too; presumably because the data at each address can straddle two blocks - confirm */
+	dcbt	T2, r0
+	dcbt	T3, r0
+	dcbt	T4, r0
+
 DSTRM_LT_L4x16_LOOP_START:
@@ -26,15 +53,30 @@ DSTRM_LT_L4x16_LOOP_START:
 	addic.	L, KK, 0
-	ble	DSTRM_LT_L4x16_SAVE
+	ble-	DSTRM_LT_L4x16_SAVE	/* same test as the removed "ble"; the "-" suffix is a static hint that the branch is unlikely to be taken */
 DSTRM_LT_L4x16_LOOP:
+	dcbt	AO, PRE	/* prefetch hint at AO + PRE; repeated before every KERNEL_16x4 below */
+	dcbt	BO, PRE	/* BO is touched only once per pass through the 4-way unrolled body */
+	KERNEL_16x4	/* unrolled step 1 of 4 (KERNEL_16x4 is defined outside this patch) */
+	addic.	L, L, -1	/* L -= 1; record form, so CR0 feeds the branch that follows */
+	ble-	DSTRM_LT_L4x16_SAVE	/* leave the unrolled body as soon as L <= 0 */
+
+	dcbt	AO, PRE
+	KERNEL_16x4	/* unrolled step 2 of 4 */
+	addic.	L, L, -1
+	ble-	DSTRM_LT_L4x16_SAVE
+
+	dcbt	AO, PRE
 	KERNEL_16x4
+	addic.	L, L, -1	/* counter update for unrolled step 3 of 4 (its KERNEL_16x4 is the unchanged context line above) */
+	ble-	DSTRM_LT_L4x16_SAVE
+
+	dcbt	AO, PRE
+	KERNEL_16x4	/* unrolled step 4 of 4; the unchanged "addic." below decrements L for it */
 	addic.	L, L, -1
-	bgt	DSTRM_LT_L4x16_LOOP
+	bgt+	DSTRM_LT_L4x16_LOOP	/* same test as the removed "bgt"; the "+" suffix hints the backward branch as likely taken */
 DSTRM_LT_L4x16_SAVE: