| @@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| @@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define o40 r12 | |||
| #define o56 r11 | |||
| #define o112 r14 | |||
| #define o8 r15 | |||
| #define o24 r16 | |||
| #define ALPHA r17 | |||
| #define o64 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define KK r20 | |||
| #define BB r21 | |||
| #define o80 r20 | |||
| #define o96 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| @@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| @@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| #endif | |||
| stfd f1, ALPHA_SP | |||
| @@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble .L999_H1 | |||
| #ifdef __64BIT__ | |||
| addi ALPHA, SP, 296 | |||
| addi T1, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| addi T1, SP, 224 | |||
| #endif | |||
| li PRE, 384 | |||
| @@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| li o64, 64 | |||
| li o80, 80 | |||
| li o96, 96 | |||
| li o112, 112 | |||
| lxvdsx alpha_r, 0, ALPHA | |||
| lxvdsx alpha_r, 0, T1 | |||
| #include "dgemm_logic_16x4_power8.S" | |||
| @@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| @@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| @@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN: | |||
| srawi. I, M, 4 | |||
| ble LDGEMM_L4x16_END | |||
| .align 5 | |||
| .align 4 | |||
| LDGEMM_L4x16_BEGIN: | |||
| li T4, -128 | |||
| li L, -128 | |||
| and T1, CO, T4 | |||
| mr T1, CO | |||
| add T2, T1, LDC | |||
| add T3, T2, LDC | |||
| add T4, T3, LDC | |||
| and T1, T1, L | |||
| and T2, T2, L | |||
| and T3, T3, L | |||
| and T4, T4, L | |||
| dcbt T1, r0 | |||
| dcbt T2, r0 | |||
| dcbt T3, r0 | |||
| dcbt T4, r0 | |||
| andi. cr0, CO, 127 | |||
| ble LDGEMM_L4x16_BEGIN_NOPRE | |||
| mr BO, B | |||
| srawi. L, K, 1 | |||
| addi T1, T1, 128 | |||
| addi T2, T2, 128 | |||
| @@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN: | |||
| dcbt T3, r0 | |||
| dcbt T4, r0 | |||
| LDGEMM_L4x16_BEGIN_NOPRE: | |||
| mr BO, B | |||
| srawi. L, K, 2 | |||
| ble LDGEMM_L4x16_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble LDGEMM_L4x16_SUB4 | |||
| .align 5 | |||
| .align 4 | |||
| LDGEMM_L4x16_LOOP_START: | |||
| li o40, 40 | |||
| li o56, 56 | |||
| dcbt AO, PRE | |||
| LOAD4x16_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_I1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_2 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_2 | |||
| addic. L, L, -2 | |||
| KERNEL4x16_L2 | |||
| ble LDGEMM_L4x16_LOOP_END | |||
| .align 7 | |||
| .align 4 | |||
| LDGEMM_L4x16_LOOP: | |||
| dcbt AO, PRE | |||
| KERNEL4x16_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_2 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_1 | |||
| KERNEL4x16_L1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_2 | |||
| addic. L, L, -1 | |||
| KERNEL4x16_L2 | |||
| bgt LDGEMM_L4x16_LOOP | |||
| .align 5 | |||
| .align 4 | |||
| LDGEMM_L4x16_LOOP_END: | |||
| dcbt AO, PRE | |||
| KERNEL4x16_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x16_2 | |||
| KERNEL4x16_1 | |||
| KERNEL4x16_E2 | |||
| @@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4: | |||
| KERNEL4x16_SUBI1 | |||
| KERNEL4x16_SUB1 | |||
| KERNEL4x16_SUB1 | |||
| KERNEL4x16_SUB1 | |||
| b LDGEMM_L4x16_SUB1 | |||
| LDGEMM_L4x16_SUB0: | |||
| andi. L, K, 3 | |||
| andi. L, K, 1 | |||
| KERNEL4x16_SUBI1 | |||
| @@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0: | |||
| LDGEMM_L4x16_SUB1: | |||
| andi. L, K, 3 | |||
| andi. L, K, 1 | |||
| ble LDGEMM_L4x16_SAVE | |||
| LDGEMM_L4x16_SUB2: | |||
| @@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2: | |||
| addic. L, L, -1 | |||
| bgt LDGEMM_L4x16_SUB2 | |||
| .align 5 | |||
| .align 4 | |||
| LDGEMM_L4x16_SAVE: | |||
| SAVE4x16 | |||
| @@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN: | |||
| LDGEMM_L4x8_LOOP_START: | |||
| dcbt AO, PRE | |||
| LOAD4x8_1 | |||
| KERNEL4x8_I1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| addic. L, L, -2 | |||
| @@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START: | |||
| LDGEMM_L4x8_LOOP: | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| KERNEL4x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x8_2 | |||
| addic. L, L, -1 | |||
| @@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN: | |||
| LDGEMM_L4x4_LOOP_START: | |||
| dcbt AO, PRE | |||
| LOAD4x4_1 | |||
| KERNEL4x4_I1 | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x4_2 | |||
| addic. L, L, -2 | |||
| @@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP: | |||
| KERNEL4x4_1 | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| KERNEL4x4_2 | |||
| KERNEL4x4_1 | |||
| dcbt AO, PRE | |||
| KERNEL4x4_2 | |||
| addic. L, L, -1 | |||
| @@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN: | |||
| LDGEMM_L2x8_LOOP_START: | |||
| dcbt AO, PRE | |||
| LOAD2x8_1 | |||
| KERNEL2x8_I1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -2 | |||
| @@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START: | |||
| LDGEMM_L2x8_LOOP: | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -1 | |||
| @@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN: | |||
| LDGEMM_L1x8_LOOP_START: | |||
| dcbt AO, PRE | |||
| LOAD1x8_1 | |||
| KERNEL1x8_I1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| addic. L, L, -2 | |||
| @@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START: | |||
| LDGEMM_L1x8_LOOP: | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| KERNEL1x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL1x8_2 | |||
| addic. L, L, -1 | |||
| @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvdsx vs24, 0, BO | |||
| lxvdsx vs25, o8, BO | |||
| addi AO, AO, 64 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| lxvdsx vs26, o16, BO | |||
| lxvdsx vs27, o24, BO | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| .macro KERNEL4x16_I1 | |||
| xvmuldp vs32, vs0, vs24 | |||
| xvmuldp vs33, vs1, vs24 | |||
| xvmuldp vs34, vs2, vs24 | |||
| xvmuldp vs35, vs3, vs24 | |||
| xvmuldp vs32, vs0, vs24 | |||
| xvmuldp vs33, vs1, vs24 | |||
| xvmuldp vs34, vs2, vs24 | |||
| xvmuldp vs35, vs3, vs24 | |||
| lxvd2x vs8, 0, AO | |||
| lxvd2x vs8, o0, AO | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmuldp vs36, vs4, vs24 | |||
| xvmuldp vs37, vs5, vs24 | |||
| xvmuldp vs38, vs6, vs24 | |||
| xvmuldp vs39, vs7, vs24 | |||
| xvmuldp vs36, vs4, vs24 | |||
| xvmuldp vs37, vs5, vs24 | |||
| xvmuldp vs38, vs6, vs24 | |||
| xvmuldp vs39, vs7, vs24 | |||
| lxvdsx vs28, 0, BO | |||
| lxvdsx vs29, o8, BO | |||
| xvmuldp vs40, vs0, vs25 | |||
| xvmuldp vs41, vs1, vs25 | |||
| xvmuldp vs42, vs2, vs25 | |||
| xvmuldp vs43, vs3, vs25 | |||
| xvmuldp vs40, vs0, vs25 | |||
| xvmuldp vs41, vs1, vs25 | |||
| xvmuldp vs42, vs2, vs25 | |||
| xvmuldp vs43, vs3, vs25 | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmuldp vs44, vs4, vs25 | |||
| xvmuldp vs45, vs5, vs25 | |||
| xvmuldp vs46, vs6, vs25 | |||
| xvmuldp vs47, vs7, vs25 | |||
| xvmuldp vs44, vs4, vs25 | |||
| xvmuldp vs45, vs5, vs25 | |||
| xvmuldp vs46, vs6, vs25 | |||
| xvmuldp vs47, vs7, vs25 | |||
| addi AO, AO, 64 | |||
| xvmuldp vs48, vs0, vs26 | |||
| xvmuldp vs49, vs1, vs26 | |||
| xvmuldp vs50, vs2, vs26 | |||
| xvmuldp vs51, vs3, vs26 | |||
| xvmuldp vs48, vs0, vs26 | |||
| xvmuldp vs49, vs1, vs26 | |||
| xvmuldp vs50, vs2, vs26 | |||
| xvmuldp vs51, vs3, vs26 | |||
| lxvd2x vs12, 0, AO | |||
| lxvd2x vs13, o16, AO | |||
| lxvd2x vs12, o64, AO | |||
| lxvd2x vs13, o80, AO | |||
| xvmuldp vs52, vs4, vs26 | |||
| xvmuldp vs53, vs5, vs26 | |||
| xvmuldp vs54, vs6, vs26 | |||
| xvmuldp vs55, vs7, vs26 | |||
| xvmuldp vs52, vs4, vs26 | |||
| xvmuldp vs53, vs5, vs26 | |||
| xvmuldp vs54, vs6, vs26 | |||
| xvmuldp vs55, vs7, vs26 | |||
| lxvd2x vs14, o32, AO | |||
| lxvd2x vs15, o48, AO | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmuldp vs56, vs0, vs27 | |||
| xvmuldp vs57, vs1, vs27 | |||
| xvmuldp vs58, vs2, vs27 | |||
| xvmuldp vs59, vs3, vs27 | |||
| xvmuldp vs56, vs0, vs27 | |||
| xvmuldp vs57, vs1, vs27 | |||
| xvmuldp vs58, vs2, vs27 | |||
| xvmuldp vs59, vs3, vs27 | |||
| lxvdsx vs30, o16, BO | |||
| lxvdsx vs31, o24, BO | |||
| xvmuldp vs60, vs4, vs27 | |||
| xvmuldp vs61, vs5, vs27 | |||
| xvmuldp vs62, vs6, vs27 | |||
| xvmuldp vs63, vs7, vs27 | |||
| xvmuldp vs60, vs4, vs27 | |||
| xvmuldp vs61, vs5, vs27 | |||
| xvmuldp vs62, vs6, vs27 | |||
| xvmuldp vs63, vs7, vs27 | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| addi AO, AO, 128 | |||
| .endm | |||
| .macro KERNEL4x16_1 | |||
| xvmaddadp vs32, vs0, vs24 | |||
| @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs34, vs2, vs24 | |||
| xvmaddadp vs35, vs3, vs24 | |||
| lxvd2x vs8, 0, AO | |||
| lxvd2x vs8, o0, AO | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs36, vs4, vs24 | |||
| xvmaddadp vs37, vs5, vs24 | |||
| @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| xvmaddadp vs47, vs7, vs25 | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs48, vs0, vs26 | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| lxvd2x vs12, 0, AO | |||
| lxvd2x vs13, o16, AO | |||
| lxvd2x vs12, o64, AO | |||
| lxvd2x vs13, o80, AO | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| xvmaddadp vs55, vs7, vs26 | |||
| lxvd2x vs14, o32, AO | |||
| lxvd2x vs15, o48, AO | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmaddadp vs56, vs0, vs27 | |||
| xvmaddadp vs57, vs1, vs27 | |||
| @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs62, vs6, vs27 | |||
| xvmaddadp vs63, vs7, vs27 | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs46, vs14, vs29 | |||
| xvmaddadp vs47, vs15, vs29 | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs48, vs8, vs30 | |||
| xvmaddadp vs49, vs9, vs30 | |||
| xvmaddadp vs50, vs10, vs30 | |||
| xvmaddadp vs51, vs11, vs30 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| xvmaddadp vs52, vs12, vs30 | |||
| xvmaddadp vs53, vs13, vs30 | |||
| xvmaddadp vs54, vs14, vs30 | |||
| xvmaddadp vs55, vs15, vs30 | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| xvmaddadp vs56, vs8, vs31 | |||
| xvmaddadp vs57, vs9, vs31 | |||
| @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs62, vs14, vs31 | |||
| xvmaddadp vs63, vs15, vs31 | |||
| addi AO, AO, 64 | |||
| addi AO, AO, 128 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| .macro KERNEL4x16_L1 | |||
| xvmaddadp vs32, vs0, vs24 | |||
| xvmaddadp vs33, vs1, vs24 | |||
| xvmaddadp vs34, vs2, vs24 | |||
| xvmaddadp vs35, vs3, vs24 | |||
| lxvd2x vs8, o0, AO | |||
| lxvd2x vs9, o16, AO | |||
| lxvd2x vs10, o32, AO | |||
| lxvd2x vs11, o48, AO | |||
| xvmaddadp vs36, vs4, vs24 | |||
| xvmaddadp vs37, vs5, vs24 | |||
| xvmaddadp vs38, vs6, vs24 | |||
| xvmaddadp vs39, vs7, vs24 | |||
| lxvdsx vs28, 0, BO | |||
| lxvdsx vs29, o8, BO | |||
| xvmaddadp vs40, vs0, vs25 | |||
| xvmaddadp vs41, vs1, vs25 | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| xvmaddadp vs47, vs7, vs25 | |||
| xvmaddadp vs48, vs0, vs26 | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| lxvd2x vs12, o64, AO | |||
| lxvd2x vs13, o80, AO | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| xvmaddadp vs55, vs7, vs26 | |||
| lxvd2x vs14, o96, AO | |||
| lxvd2x vs15, o112, AO | |||
| xvmaddadp vs56, vs0, vs27 | |||
| xvmaddadp vs57, vs1, vs27 | |||
| xvmaddadp vs58, vs2, vs27 | |||
| xvmaddadp vs59, vs3, vs27 | |||
| lxvdsx vs30, o16, BO | |||
| lxvdsx vs31, o24, BO | |||
| xvmaddadp vs60, vs4, vs27 | |||
| xvmaddadp vs61, vs5, vs27 | |||
| xvmaddadp vs62, vs6, vs27 | |||
| xvmaddadp vs63, vs7, vs27 | |||
| addi AO, AO, 128 | |||
| .endm | |||
| .macro KERNEL4x16_L2 | |||
| xvmaddadp vs32, vs8, vs28 | |||
| xvmaddadp vs33, vs9, vs28 | |||
| xvmaddadp vs34, vs10, vs28 | |||
| xvmaddadp vs35, vs11, vs28 | |||
| lxvd2x vs0, 0, AO | |||
| lxvd2x vs1, o16, AO | |||
| xvmaddadp vs36, vs12, vs28 | |||
| xvmaddadp vs37, vs13, vs28 | |||
| xvmaddadp vs38, vs14, vs28 | |||
| xvmaddadp vs39, vs15, vs28 | |||
| lxvdsx vs24, o32, BO | |||
| lxvdsx vs25, o40, BO | |||
| xvmaddadp vs40, vs8, vs29 | |||
| xvmaddadp vs41, vs9, vs29 | |||
| xvmaddadp vs42, vs10, vs29 | |||
| xvmaddadp vs43, vs11, vs29 | |||
| lxvd2x vs2, o32, AO | |||
| lxvd2x vs3, o48, AO | |||
| xvmaddadp vs44, vs12, vs29 | |||
| xvmaddadp vs45, vs13, vs29 | |||
| xvmaddadp vs46, vs14, vs29 | |||
| xvmaddadp vs47, vs15, vs29 | |||
| xvmaddadp vs48, vs8, vs30 | |||
| xvmaddadp vs49, vs9, vs30 | |||
| xvmaddadp vs50, vs10, vs30 | |||
| xvmaddadp vs51, vs11, vs30 | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| xvmaddadp vs52, vs12, vs30 | |||
| xvmaddadp vs53, vs13, vs30 | |||
| xvmaddadp vs54, vs14, vs30 | |||
| xvmaddadp vs55, vs15, vs30 | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| xvmaddadp vs56, vs8, vs31 | |||
| xvmaddadp vs57, vs9, vs31 | |||
| xvmaddadp vs58, vs10, vs31 | |||
| xvmaddadp vs59, vs11, vs31 | |||
| lxvdsx vs26, o48, BO | |||
| lxvdsx vs27, o56, BO | |||
| xvmaddadp vs60, vs12, vs31 | |||
| addi AO, AO, 128 | |||
| xvmaddadp vs61, vs13, vs31 | |||
| xvmaddadp vs62, vs14, vs31 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs63, vs15, vs31 | |||
| .endm | |||
| .macro KERNEL4x16_E2 | |||
| @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvdsx vs26, o16, BO | |||
| lxvdsx vs27, o24, BO | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs4, 0, AO | |||
| lxvd2x vs5, o16, AO | |||
| lxvd2x vs6, o32, AO | |||
| lxvd2x vs7, o48, AO | |||
| lxvd2x vs4, o64, AO | |||
| lxvd2x vs5, o80, AO | |||
| lxvd2x vs6, o96, AO | |||
| lxvd2x vs7, o112, AO | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs32, vs0, vs24 | |||
| @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs41, vs1, vs25 | |||
| xvmaddadp vs42, vs2, vs25 | |||
| xvmaddadp vs43, vs3, vs25 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs44, vs4, vs25 | |||
| xvmaddadp vs45, vs5, vs25 | |||
| xvmaddadp vs46, vs6, vs25 | |||
| @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs49, vs1, vs26 | |||
| xvmaddadp vs50, vs2, vs26 | |||
| xvmaddadp vs51, vs3, vs26 | |||
| addi AO, AO, 128 | |||
| xvmaddadp vs52, vs4, vs26 | |||
| xvmaddadp vs53, vs5, vs26 | |||
| xvmaddadp vs54, vs6, vs26 | |||
| @@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x16 | |||
| mr T1, CO | |||
| addi T2, T1, 64 | |||
| add T3, T1, LDC | |||
| addi T4, T3, 64 | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs0, 0, T1 | |||
| lxvd2x vs1, o16, T1 | |||
| lxvd2x vs2, o32, T1 | |||
| lxvd2x vs3, o48, T1 | |||
| lxvd2x vs4, 0, T2 | |||
| lxvd2x vs5, o16, T2 | |||
| lxvd2x vs6, o32, T2 | |||
| lxvd2x vs7, o48, T2 | |||
| lxvd2x vs8, 0, T3 | |||
| lxvd2x vs9, o16, T3 | |||
| lxvd2x vs10, o32, T3 | |||
| lxvd2x vs11, o48, T3 | |||
| lxvd2x vs12, 0, T4 | |||
| lxvd2x vs13, o16, T4 | |||
| lxvd2x vs14, o32, T4 | |||
| lxvd2x vs15, o48, T4 | |||
| #endif | |||
| add T2, T1, LDC | |||
| add T3, T2, LDC | |||
| add T4, T3, LDC | |||
| lxvd2x vs0, 0, CO | |||
| lxvd2x vs1, o16, CO | |||
| lxvd2x vs2, o32, CO | |||
| lxvd2x vs3, o48, CO | |||
| lxvd2x vs4, o64, CO | |||
| lxvd2x vs5, o80, CO | |||
| lxvd2x vs6, o96, CO | |||
| lxvd2x vs7, o112, CO | |||
| lxvd2x vs8, 0, T2 | |||
| lxvd2x vs9, o16, T2 | |||
| lxvd2x vs10, o32, T2 | |||
| lxvd2x vs11, o48, T2 | |||
| lxvd2x vs12, o64, T2 | |||
| lxvd2x vs13, o80, T2 | |||
| lxvd2x vs14, o96, T2 | |||
| lxvd2x vs15, o112, T2 | |||
| lxvd2x vs24, 0, T3 | |||
| lxvd2x vs25, o16, T3 | |||
| lxvd2x vs26, o32, T3 | |||
| lxvd2x vs27, o48, T3 | |||
| lxvd2x vs28, o64, T3 | |||
| lxvd2x vs29, o80, T3 | |||
| lxvd2x vs30, o96, T3 | |||
| lxvd2x vs31, o112, T3 | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs0, vs32, alpha_r | |||
| xvmaddadp vs1, vs33, alpha_r | |||
| xvmaddadp vs2, vs34, alpha_r | |||
| @@ -465,138 +599,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs5, vs37, alpha_r | |||
| xvmaddadp vs6, vs38, alpha_r | |||
| xvmaddadp vs7, vs39, alpha_r | |||
| lxvd2x vs32, 0, T4 | |||
| lxvd2x vs33, o16, T4 | |||
| lxvd2x vs34, o32, T4 | |||
| lxvd2x vs35, o48, T4 | |||
| lxvd2x vs36, o64, T4 | |||
| lxvd2x vs37, o80, T4 | |||
| lxvd2x vs38, o96, T4 | |||
| lxvd2x vs39, o112, T4 | |||
| xvmaddadp vs8, vs40, alpha_r | |||
| xvmaddadp vs9, vs41, alpha_r | |||
| xvmaddadp vs10, vs42, alpha_r | |||
| xvmaddadp vs11, vs43, alpha_r | |||
| xvmaddadp vs12, vs44, alpha_r | |||
| xvmaddadp vs13, vs45, alpha_r | |||
| xvmaddadp vs14, vs46, alpha_r | |||
| xvmaddadp vs15, vs47, alpha_r | |||
| #else | |||
| xvmuldp vs0, vs32, alpha_r | |||
| xvmuldp vs1, vs33, alpha_r | |||
| xvmuldp vs2, vs34, alpha_r | |||
| xvmuldp vs3, vs35, alpha_r | |||
| xvmuldp vs4, vs36, alpha_r | |||
| xvmuldp vs5, vs37, alpha_r | |||
| xvmuldp vs6, vs38, alpha_r | |||
| xvmuldp vs7, vs39, alpha_r | |||
| xvmuldp vs8, vs40, alpha_r | |||
| xvmuldp vs9, vs41, alpha_r | |||
| xvmuldp vs10, vs42, alpha_r | |||
| xvmuldp vs11, vs43, alpha_r | |||
| xvmuldp vs12, vs44, alpha_r | |||
| xvmuldp vs13, vs45, alpha_r | |||
| xvmuldp vs14, vs46, alpha_r | |||
| xvmuldp vs15, vs47, alpha_r | |||
| #endif | |||
| stxvd2x vs0, 0, T1 | |||
| stxvd2x vs1, o16, T1 | |||
| stxvd2x vs2, o32, T1 | |||
| stxvd2x vs3, o48, T1 | |||
| stxvd2x vs4, 0, T2 | |||
| stxvd2x vs5, o16, T2 | |||
| stxvd2x vs6, o32, T2 | |||
| stxvd2x vs7, o48, T2 | |||
| stxvd2x vs8, 0, T3 | |||
| stxvd2x vs9, o16, T3 | |||
| stxvd2x vs10, o32, T3 | |||
| stxvd2x vs11, o48, T3 | |||
| stxvd2x vs12, 0, T4 | |||
| stxvd2x vs13, o16, T4 | |||
| stxvd2x vs14, o32, T4 | |||
| stxvd2x vs15, o48, T4 | |||
| slwi T4, LDC, 1 | |||
| add T1, T1, T4 | |||
| add T3, T3, T4 | |||
| addi T2, T1, 64 | |||
| addi T4, T3, 64 | |||
| #ifndef TRMMKERNEL | |||
| lxvd2x vs0, 0, T1 | |||
| lxvd2x vs1, o16, T1 | |||
| lxvd2x vs2, o32, T1 | |||
| lxvd2x vs3, o48, T1 | |||
| lxvd2x vs4, 0, T2 | |||
| lxvd2x vs5, o16, T2 | |||
| lxvd2x vs6, o32, T2 | |||
| lxvd2x vs7, o48, T2 | |||
| lxvd2x vs8, 0, T3 | |||
| lxvd2x vs9, o16, T3 | |||
| lxvd2x vs10, o32, T3 | |||
| lxvd2x vs11, o48, T3 | |||
| lxvd2x vs12, 0, T4 | |||
| lxvd2x vs13, o16, T4 | |||
| lxvd2x vs14, o32, T4 | |||
| lxvd2x vs15, o48, T4 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| xvmaddadp vs0, vs48, alpha_r | |||
| xvmaddadp vs1, vs49, alpha_r | |||
| xvmaddadp vs2, vs50, alpha_r | |||
| xvmaddadp vs3, vs51, alpha_r | |||
| xvmaddadp vs4, vs52, alpha_r | |||
| xvmaddadp vs5, vs53, alpha_r | |||
| xvmaddadp vs6, vs54, alpha_r | |||
| xvmaddadp vs7, vs55, alpha_r | |||
| xvmaddadp vs8, vs56, alpha_r | |||
| xvmaddadp vs9, vs57, alpha_r | |||
| xvmaddadp vs10, vs58, alpha_r | |||
| xvmaddadp vs11, vs59, alpha_r | |||
| xvmaddadp vs12, vs60, alpha_r | |||
| xvmaddadp vs13, vs61, alpha_r | |||
| xvmaddadp vs14, vs62, alpha_r | |||
| xvmaddadp vs15, vs63, alpha_r | |||
| #else | |||
| xvmuldp vs0, vs48, alpha_r | |||
| xvmuldp vs1, vs49, alpha_r | |||
| xvmuldp vs2, vs50, alpha_r | |||
| xvmuldp vs3, vs51, alpha_r | |||
| xvmuldp vs4, vs52, alpha_r | |||
| xvmuldp vs5, vs53, alpha_r | |||
| xvmuldp vs6, vs54, alpha_r | |||
| xvmuldp vs7, vs55, alpha_r | |||
| xvmuldp vs8, vs56, alpha_r | |||
| xvmuldp vs9, vs57, alpha_r | |||
| xvmuldp vs10, vs58, alpha_r | |||
| xvmuldp vs11, vs59, alpha_r | |||
| xvmuldp vs12, vs60, alpha_r | |||
| xvmuldp vs13, vs61, alpha_r | |||
| xvmuldp vs14, vs62, alpha_r | |||
| xvmuldp vs15, vs63, alpha_r | |||
| #endif | |||
| stxvd2x vs0, 0, T1 | |||
| stxvd2x vs1, o16, T1 | |||
| stxvd2x vs2, o32, T1 | |||
| stxvd2x vs3, o48, T1 | |||
| xvmaddadp vs12, vs44, alpha_r | |||
| xvmaddadp vs13, vs45, alpha_r | |||
| xvmaddadp vs14, vs46, alpha_r | |||
| xvmaddadp vs15, vs47, alpha_r | |||
| stxvd2x vs4, 0, T2 | |||
| stxvd2x vs5, o16, T2 | |||
| stxvd2x vs6, o32, T2 | |||
| stxvd2x vs7, o48, T2 | |||
| stxvd2x vs4, o64, T1 | |||
| stxvd2x vs5, o80, T1 | |||
| stxvd2x vs6, o96, T1 | |||
| stxvd2x vs7, o112, T1 | |||
| xvmaddadp vs24, vs48, alpha_r | |||
| xvmaddadp vs25, vs49, alpha_r | |||
| xvmaddadp vs26, vs50, alpha_r | |||
| xvmaddadp vs27, vs51, alpha_r | |||
| stxvd2x vs8, o0, T2 | |||
| stxvd2x vs9, o16, T2 | |||
| stxvd2x vs10, o32, T2 | |||
| stxvd2x vs11, o48, T2 | |||
| xvmaddadp vs28, vs52, alpha_r | |||
| xvmaddadp vs29, vs53, alpha_r | |||
| xvmaddadp vs30, vs54, alpha_r | |||
| xvmaddadp vs31, vs55, alpha_r | |||
| stxvd2x vs12, o64, T2 | |||
| stxvd2x vs13, o80, T2 | |||
| stxvd2x vs14, o96, T2 | |||
| stxvd2x vs15, o112, T2 | |||
| xvmaddadp vs32, vs56, alpha_r | |||
| xvmaddadp vs33, vs57, alpha_r | |||
| xvmaddadp vs34, vs58, alpha_r | |||
| xvmaddadp vs35, vs59, alpha_r | |||
| stxvd2x vs24, 0, T3 | |||
| stxvd2x vs25, o16, T3 | |||
| stxvd2x vs26, o32, T3 | |||
| stxvd2x vs27, o48, T3 | |||
| xvmaddadp vs36, vs60, alpha_r | |||
| xvmaddadp vs37, vs61, alpha_r | |||
| xvmaddadp vs38, vs62, alpha_r | |||
| xvmaddadp vs39, vs63, alpha_r | |||
| stxvd2x vs28, o64, T3 | |||
| stxvd2x vs29, o80, T3 | |||
| stxvd2x vs30, o96, T3 | |||
| stxvd2x vs31, o112, T3 | |||
| stxvd2x vs32, o0, T4 | |||
| stxvd2x vs33, o16, T4 | |||
| stxvd2x vs34, o32, T4 | |||
| stxvd2x vs35, o48, T4 | |||
| stxvd2x vs8, 0, T3 | |||
| stxvd2x vs9, o16, T3 | |||
| stxvd2x vs10, o32, T3 | |||
| stxvd2x vs11, o48, T3 | |||
| addi CO, CO, 128 | |||
| stxvd2x vs12, 0, T4 | |||
| stxvd2x vs13, o16, T4 | |||
| stxvd2x vs14, o32, T4 | |||
| stxvd2x vs15, o48, T4 | |||
| stxvd2x vs36, o64, T4 | |||
| stxvd2x vs37, o80, T4 | |||
| stxvd2x vs38, o96, T4 | |||
| stxvd2x vs39, o112, T4 | |||
| addi CO, CO, 128 | |||
| .endm | |||
| @@ -0,0 +1,228 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define M r3 | |||
| #define N r4 | |||
| #define A r5 | |||
| #define LDA r6 | |||
| #define B r7 | |||
| #define A0 r8 | |||
| #define A1 r9 | |||
| #define A2 r10 | |||
| #define A3 r11 | |||
| #define J r12 | |||
| #define PREA r14 | |||
| #define PREB r15 | |||
| #define BO r16 | |||
| #define o64 r17 | |||
| #define o80 r18 | |||
| #define o96 r19 | |||
| #define o112 r20 | |||
| #define o8 r21 | |||
| #define T2 r22 | |||
| #define I r23 | |||
| #define o16 r24 | |||
| #define o32 r25 | |||
| #define o48 r26 | |||
| #define NOTU1 r27 | |||
| #define NOTU2 r30 | |||
| #define T1 r31 | |||
| #define o0 0 | |||
| #include "dgemm_ncopy_macros_4_power8.S" | |||
| #define STACKSIZE 384 | |||
PROLOGUE
PROFCODE
// Prologue: allocate the 384-byte frame and spill every callee-saved
// register this kernel touches (FPRs f14-f31, GPRs r14-r31).
addi SP, SP, -STACKSIZE
li r0, 0 // r0 = 0 (not otherwise read in the visible code)
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
// Nothing to pack for degenerate sizes: bail out if M == 0 or N == 0.
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
ble- L999
// LDA arrives in elements; scale to bytes (BASE_SHIFT is defined in the
// common headers -- presumably 3 for double precision; confirm there).
slwi LDA, LDA, BASE_SHIFT
li PREA, 384 // prefetch 384 bytes ahead of each A column pointer
li PREB, 384 // NOTE(review): PREB is initialized but appears unused -- confirm
// Materialize the constant byte offsets used as index registers by the
// lxvd2x/stxvd2x instructions inside the COPY_* macros.
li o8, 8
li o16, 16
li o32, 32
li o48, 48
li o64, 64
li o80, 80
li o96, 96
li o112, 112
// Main packing loops (DCOPYN_*), built on the COPY_* macros.
#include "dgemm_ncopy_logic_4_power8.S"
L999:
li r3, 0 // return value 0
// Epilogue: restore all spilled registers and release the frame.
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
| @@ -0,0 +1,237 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
// Packing logic for dgemm_ncopy_4: walk A in panels of 4 columns
// (then a 2-column and a 1-column pass for the N remainder); within
// each panel, copy rows in chunks of 16/8/4/2/1 (M remainder) into
// the packed buffer via the COPY_* macros.
mr BO, B
srawi. I, N, 2 // I = number of full 4-column panels
ble DCOPYN_L2_BEGIN
DCOPYN_L4_BEGIN:
DCOPYN_L4_LOOP:
// Set up the four column pointers and advance A past this panel.
mr A0, A
add A1, A0, LDA
add A2, A1, LDA
add A3, A2, LDA
add A, A3, LDA
DCOPYN_L4x16_BEGIN:
srawi. J, M, 4 // J = number of 16-row chunks
ble DCOPYN_L4x16_END
DCOPYN_L4x16_LOOP:
// Prefetch ahead on all four columns before each 16x4 tile copy.
dcbt A0, PREA
dcbt A1, PREA
dcbt A2, PREA
dcbt A3, PREA
COPY_4x16
addic. J, J, -1
bgt DCOPYN_L4x16_LOOP
DCOPYN_L4x16_END:
DCOPYN_L4x8_BEGIN:
andi. J, M, 8 // 8-row remainder?
ble DCOPYN_L4x8_END
COPY_4x8
DCOPYN_L4x8_END:
DCOPYN_L4x4_BEGIN:
andi. J, M, 4 // 4-row remainder?
ble DCOPYN_L4x4_END
COPY_4x4
DCOPYN_L4x4_END:
DCOPYN_L4x2_BEGIN:
andi. J, M, 2 // 2-row remainder?
ble DCOPYN_L4x2_END
COPY_4x2
DCOPYN_L4x2_END:
DCOPYN_L4x1_BEGIN:
andi. J, M, 1 // 1-row remainder?
ble DCOPYN_L4x1_END
COPY_4x1
DCOPYN_L4x1_END:
DCOPYN_L4_END:
addic. I, I, -1
bgt DCOPYN_L4_LOOP
DCOPYN_L2_BEGIN:
// Fix: test the N remainder through the symbolic name. The previous
// "andi. T1, 4, 2" relied on the assembler reading the bare "4" as
// GPR r4 (== N); spelling it N matches the rest of this file and
// cannot be misread as an immediate mask.
andi. T1, N, 2 // 2-column remainder panel?
ble DCOPYN_L2_END
DCOPYN_L2_LOOP:
mr A0, A
add A1, A0, LDA
add A, A1, LDA
DCOPYN_L2x16_BEGIN:
srawi. J, M, 4
ble DCOPYN_L2x16_END
DCOPYN_L2x16_LOOP:
COPY_2x16
addic. J, J, -1
bgt DCOPYN_L2x16_LOOP
DCOPYN_L2x16_END:
DCOPYN_L2x8_BEGIN:
andi. J, M, 8
ble DCOPYN_L2x8_END
COPY_2x8
DCOPYN_L2x8_END:
DCOPYN_L2x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L2x4_END
COPY_2x4
DCOPYN_L2x4_END:
DCOPYN_L2x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L2x2_END
COPY_2x2
DCOPYN_L2x2_END:
DCOPYN_L2x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L2x1_END
COPY_2x1
DCOPYN_L2x1_END:
DCOPYN_L2_END:
DCOPYN_L1_BEGIN:
// Same fix as above: "andi. T1, 4, 1" -> test N's low bit by name.
andi. T1, N, 1 // 1-column remainder panel?
ble DCOPYN_L1_END
DCOPYN_L1_LOOP:
mr A0, A
add A, A0, LDA
DCOPYN_L1x16_BEGIN:
srawi. J, M, 4
ble DCOPYN_L1x16_END
DCOPYN_L1x16_LOOP:
COPY_1x16
addic. J, J, -1
bgt DCOPYN_L1x16_LOOP
DCOPYN_L1x16_END:
DCOPYN_L1x8_BEGIN:
andi. J, M, 8
ble DCOPYN_L1x8_END
COPY_1x8
DCOPYN_L1x8_END:
DCOPYN_L1x4_BEGIN:
andi. J, M, 4
ble DCOPYN_L1x4_END
COPY_1x4
DCOPYN_L1x4_END:
DCOPYN_L1x2_BEGIN:
andi. J, M, 2
ble DCOPYN_L1x2_END
COPY_1x2
DCOPYN_L1x2_END:
DCOPYN_L1x1_BEGIN:
andi. J, M, 1
ble DCOPYN_L1x1_END
COPY_1x1
DCOPYN_L1x1_END:
DCOPYN_L1_END:
| @@ -0,0 +1,691 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /********************************************************************************************** | |||
| * Macros for N=4 and M=16 | |||
| **********************************************************************************************/ | |||
.macro COPY_4x16
// Pack a 16-row x 4-column tile: load 128 bytes (8 vector registers of
// 2 doubles) from each column A0..A3, pair-wise interleave doublewords
// of (A0,A1) and (A2,A3) with xxpermdi (sel 0 = both first doublewords,
// sel 3 = both second), and store the resulting row-major 4-wide groups
// to BO. Advances A0..A3 by 128 and BO by 512 bytes. Load order of
// A2/A3 and the placement of the addi's are scheduling choices.
lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1
lxvd2x vs24, o0, A3
lxvd2x vs16, o0, A2
lxvd2x vs1, o16, A0
lxvd2x vs9, o16, A1
lxvd2x vs17, o16, A2
lxvd2x vs25, o16, A3
lxvd2x vs2, o32, A0
lxvd2x vs10, o32, A1
lxvd2x vs18, o32, A2
lxvd2x vs26, o32, A3
lxvd2x vs3, o48, A0
lxvd2x vs11, o48, A1
lxvd2x vs19, o48, A2
lxvd2x vs27, o48, A3
lxvd2x vs4, o64, A0
lxvd2x vs12, o64, A1
lxvd2x vs20, o64, A2
lxvd2x vs28, o64, A3
lxvd2x vs5, o80, A0
lxvd2x vs13, o80, A1
lxvd2x vs21, o80, A2
lxvd2x vs29, o80, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
// Transpose 2x2 doubleword blocks: vs32.. hold row-interleaved data.
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0
xxpermdi vs53, vs21, vs29, 0
xxpermdi vs54, vs5, vs13, 3
xxpermdi vs55, vs21, vs29, 3
// Pointer bumps interleaved with the remaining permutes for scheduling.
addi A0, A0, 128
addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0
xxpermdi vs57, vs22, vs30, 0
xxpermdi vs58, vs6, vs14, 3
xxpermdi vs59, vs22, vs30, 3
addi A3, A3, 128
addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
// Store 64 doubles (4 groups of 128 bytes) to the packed buffer.
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
stxvd2x vs48, o0, BO
stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO
stxvd2x vs53, o80, BO
stxvd2x vs54, o96, BO
stxvd2x vs55, o112, BO
addi BO, BO, 128
stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO
stxvd2x vs59, o48, BO
stxvd2x vs60, o64, BO
stxvd2x vs61, o80, BO
stxvd2x vs62, o96, BO
stxvd2x vs63, o112, BO
addi BO, BO, 128
.endm
| /********************************************************************************************** | |||
| * Macros for N=4 and M=8 | |||
| **********************************************************************************************/ | |||
.macro COPY_4x8
// Pack an 8-row x 4-column tile: 64 bytes from each column, interleave
// (A0,A1) and (A2,A3) doubleword pairs with xxpermdi, store 256 bytes
// to BO. Advances A0..A3 by 64 and BO by 256 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs18, o32, A2
lxvd2x vs19, o48, A2
addi A2, A2, 64
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs26, o32, A3
lxvd2x vs27, o48, A3
addi A3, A3, 64
// sel 0 = first doublewords of both sources, sel 3 = second doublewords.
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
| /********************************************************************************************** | |||
| * Macros for N=4 and M=4 | |||
| **********************************************************************************************/ | |||
.macro COPY_4x4
// Pack a 4-row x 4-column tile: 32 bytes from each column, interleave
// (A0,A1) and (A2,A3) doubleword pairs, store 128 bytes to BO.
// Advances A0..A3 by 32 and BO by 128 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
addi A2, A2, 32
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
addi A3, A3, 32
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
| /********************************************************************************************** | |||
| * Macros for N=4 and M=2 | |||
| **********************************************************************************************/ | |||
.macro COPY_4x2
// Pack a 2-row x 4-column tile: one 16-byte vector (2 doubles) per
// column, interleave pairs, store 64 bytes to BO.
// Advances A0..A3 by 16 and BO by 64 bytes.
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
lxvd2x vs16, o0, A2
addi A2, A2, 16
lxvd2x vs24, o0, A3
addi A3, A3, 16
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
| /********************************************************************************************** | |||
| * Macros for N=4 and M=1 | |||
| **********************************************************************************************/ | |||
.macro COPY_4x1
// Pack a 1-row x 4-column tile: load one scalar double per column
// (lxsdx), merge (A0,A1) and (A2,A3) into two vectors, store 32 bytes.
// Advances A0..A3 by 8 and BO by 32 bytes.
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
lxsdx vs16, o0, A2
addi A2, A2, 8
lxsdx vs24, o0, A3
addi A3, A3, 8
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
| /********************************************************************************************** | |||
| * Macros for N=2 and M=16 | |||
| **********************************************************************************************/ | |||
.macro COPY_2x16
// Pack a 16-row x 2-column tile: 128 bytes from each of A0/A1,
// interleave doubleword pairs, store 256 bytes to BO.
// Advances A0/A1 by 128 and BO by 256 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1
lxvd2x vs15, o112, A1
addi A1, A1, 128
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
xxpermdi vs40, vs4, vs12, 0
xxpermdi vs41, vs4, vs12, 3
xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0
xxpermdi vs45, vs6, vs14, 3
xxpermdi vs46, vs7, vs15, 0
xxpermdi vs47, vs7, vs15, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
stxvd2x vs40, o0, BO
stxvd2x vs41, o16, BO
stxvd2x vs42, o32, BO
stxvd2x vs43, o48, BO
stxvd2x vs44, o64, BO
stxvd2x vs45, o80, BO
stxvd2x vs46, o96, BO
stxvd2x vs47, o112, BO
addi BO, BO, 128
.endm
| /********************************************************************************************** | |||
| * Macros for N=2 and M=8 | |||
| **********************************************************************************************/ | |||
.macro COPY_2x8
// Pack an 8-row x 2-column tile: 64 bytes from each of A0/A1,
// interleave doubleword pairs, store 128 bytes to BO.
// Advances A0/A1 by 64 and BO by 128 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
lxvd2x vs10, o32, A1
lxvd2x vs11, o48, A1
addi A1, A1, 64
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
xxpermdi vs36, vs2, vs10, 0
xxpermdi vs37, vs2, vs10, 3
xxpermdi vs38, vs3, vs11, 0
xxpermdi vs39, vs3, vs11, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
stxvd2x vs36, o64, BO
stxvd2x vs37, o80, BO
stxvd2x vs38, o96, BO
stxvd2x vs39, o112, BO
addi BO, BO, 128
.endm
| /********************************************************************************************** | |||
| * Macros for N=2 and M=4 | |||
| **********************************************************************************************/ | |||
.macro COPY_2x4
// Pack a 4-row x 2-column tile: 32 bytes per column, interleave,
// store 64 bytes. Advances A0/A1 by 32 and BO by 64 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
lxvd2x vs8, o0, A1
lxvd2x vs9, o16, A1
addi A1, A1, 32
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
xxpermdi vs34, vs1, vs9, 0
xxpermdi vs35, vs1, vs9, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
stxvd2x vs35, o48, BO
addi BO, BO, 64
.endm
| /********************************************************************************************** | |||
| * Macros for N=2 and M=2 | |||
| **********************************************************************************************/ | |||
.macro COPY_2x2
// Pack a 2-row x 2-column tile: one vector per column, interleave,
// store 32 bytes. Advances A0/A1 by 16 and BO by 32 bytes.
lxvd2x vs0, o0, A0
addi A0, A0, 16
lxvd2x vs8, o0, A1
addi A1, A1, 16
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs0, vs8, 3
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
addi BO, BO, 32
.endm
| /********************************************************************************************** | |||
| * Macros for N=2 and M=1 | |||
| **********************************************************************************************/ | |||
.macro COPY_2x1
// Pack a 1-row x 2-column tile: one scalar double per column, merged
// into a single vector store. Advances A0/A1 by 8 and BO by 16 bytes.
lxsdx vs0, o0, A0
addi A0, A0, 8
lxsdx vs8, o0, A1
addi A1, A1, 8
xxpermdi vs32, vs0, vs8, 0
stxvd2x vs32, o0, BO
addi BO, BO, 16
.endm
| /********************************************************************************************** | |||
| * Macros for N=1 and M=16 | |||
| **********************************************************************************************/ | |||
.macro COPY_1x16
// Single-column case, 16 rows: straight 128-byte copy from A0 to BO
// (no interleave needed). Advances A0 and BO by 128 bytes.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
lxvd2x vs4, o64, A0
lxvd2x vs5, o80, A0
lxvd2x vs6, o96, A0
lxvd2x vs7, o112, A0
addi A0, A0, 128
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
stxvd2x vs4, o0, BO
stxvd2x vs5, o16, BO
stxvd2x vs6, o32, BO
stxvd2x vs7, o48, BO
addi BO, BO, 64
.endm
| /********************************************************************************************** | |||
| * Macros for N=1 and M=8 | |||
| **********************************************************************************************/ | |||
.macro COPY_1x8
// Single-column case, 8 rows: straight 64-byte copy from A0 to BO.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
lxvd2x vs2, o32, A0
lxvd2x vs3, o48, A0
addi A0, A0, 64
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
stxvd2x vs2, o32, BO
stxvd2x vs3, o48, BO
addi BO, BO, 64
.endm
| /********************************************************************************************** | |||
| * Macros for N=1 and M=4 | |||
| **********************************************************************************************/ | |||
.macro COPY_1x4
// Single-column case, 4 rows: straight 32-byte copy from A0 to BO.
lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0
addi A0, A0, 32
stxvd2x vs0, o0, BO
stxvd2x vs1, o16, BO
addi BO, BO, 32
.endm
| /********************************************************************************************** | |||
| * Macros for N=1 and M=2 | |||
| **********************************************************************************************/ | |||
.macro COPY_1x2
// Single-column case, 2 rows: one 16-byte vector copy from A0 to BO.
lxvd2x vs0, o0, A0
addi A0, A0, 16
stxvd2x vs0, o0, BO
addi BO, BO, 16
.endm
| /********************************************************************************************** | |||
| * Macros for N=1 and M=1 | |||
| **********************************************************************************************/ | |||
.macro COPY_1x1
// Single-column case, 1 row: scalar double copy (lxsdx/stxsdx).
lxsdx vs0, o0, A0
addi A0, A0, 8
stxsdx vs0, o0, BO
addi BO, BO, 8
.endm