| @@ -21,7 +21,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | DGEMMKERNEL = dgemm_kernel_16x4_power8.S | ||||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | DGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | DGEMMITCOPY = dgemm_tcopy_16_power8.S | ||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | ||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | DGEMMINCOPYOBJ = dgemm_incopy.o | ||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | DGEMMITCOPYOBJ = dgemm_itcopy.o | ||||
| @@ -134,13 +134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define T4 r12 | #define T4 r12 | ||||
| #define T3 r11 | #define T3 r11 | ||||
| #define o40 r12 | |||||
| #define o56 r11 | |||||
| #define o112 r14 | |||||
| #define o8 r15 | #define o8 r15 | ||||
| #define o24 r16 | #define o24 r16 | ||||
| #define ALPHA r17 | |||||
| #define o64 r17 | |||||
| #define L r18 | #define L r18 | ||||
| #define T1 r19 | #define T1 r19 | ||||
| #define KK r20 | |||||
| #define BB r21 | |||||
| #define o80 r20 | |||||
| #define o96 r21 | |||||
| #define I r22 | #define I r22 | ||||
| #define J r23 | #define J r23 | ||||
| #define AO r24 | #define AO r24 | ||||
| @@ -205,6 +209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| std r17, 256(SP) | std r17, 256(SP) | ||||
| std r16, 264(SP) | std r16, 264(SP) | ||||
| std r15, 272(SP) | std r15, 272(SP) | ||||
| std r14, 280(SP) | |||||
| #else | #else | ||||
| stw r31, 144(SP) | stw r31, 144(SP) | ||||
| stw r30, 148(SP) | stw r30, 148(SP) | ||||
| @@ -223,6 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stw r17, 200(SP) | stw r17, 200(SP) | ||||
| stw r16, 204(SP) | stw r16, 204(SP) | ||||
| stw r15, 208(SP) | stw r15, 208(SP) | ||||
| stw r14, 212(SP) | |||||
| #endif | #endif | ||||
| stfd f1, ALPHA_SP | stfd f1, ALPHA_SP | ||||
| @@ -263,9 +269,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ble .L999_H1 | ble .L999_H1 | ||||
| #ifdef __64BIT__ | #ifdef __64BIT__ | ||||
| addi ALPHA, SP, 296 | |||||
| addi T1, SP, 296 | |||||
| #else | #else | ||||
| addi ALPHA, SP, 224 | |||||
| addi T1, SP, 224 | |||||
| #endif | #endif | ||||
| li PRE, 384 | li PRE, 384 | ||||
| @@ -274,8 +280,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| li o24, 24 | li o24, 24 | ||||
| li o32, 32 | li o32, 32 | ||||
| li o48, 48 | li o48, 48 | ||||
| li o64, 64 | |||||
| li o80, 80 | |||||
| li o96, 96 | |||||
| li o112, 112 | |||||
| lxvdsx alpha_r, 0, ALPHA | |||||
| lxvdsx alpha_r, 0, T1 | |||||
| #include "dgemm_logic_16x4_power8.S" | #include "dgemm_logic_16x4_power8.S" | ||||
| @@ -323,6 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld r17, 256(SP) | ld r17, 256(SP) | ||||
| ld r16, 264(SP) | ld r16, 264(SP) | ||||
| ld r15, 272(SP) | ld r15, 272(SP) | ||||
| ld r14, 280(SP) | |||||
| #else | #else | ||||
| lwz r31, 144(SP) | lwz r31, 144(SP) | ||||
| lwz r30, 148(SP) | lwz r30, 148(SP) | ||||
| @@ -341,6 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lwz r17, 200(SP) | lwz r17, 200(SP) | ||||
| lwz r16, 204(SP) | lwz r16, 204(SP) | ||||
| lwz r15, 208(SP) | lwz r15, 208(SP) | ||||
| lwz r14, 212(SP) | |||||
| #endif | #endif | ||||
| addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
| @@ -46,23 +46,28 @@ LDGEMM_L4_BEGIN: | |||||
| srawi. I, M, 4 | srawi. I, M, 4 | ||||
| ble LDGEMM_L4x16_END | ble LDGEMM_L4x16_END | ||||
| .align 5 | |||||
| .align 4 | |||||
| LDGEMM_L4x16_BEGIN: | LDGEMM_L4x16_BEGIN: | ||||
| li T4, -128 | |||||
| li L, -128 | |||||
| and T1, CO, T4 | |||||
| mr T1, CO | |||||
| add T2, T1, LDC | add T2, T1, LDC | ||||
| add T3, T2, LDC | add T3, T2, LDC | ||||
| add T4, T3, LDC | add T4, T3, LDC | ||||
| and T1, T1, L | |||||
| and T2, T2, L | |||||
| and T3, T3, L | |||||
| and T4, T4, L | |||||
| dcbt T1, r0 | dcbt T1, r0 | ||||
| dcbt T2, r0 | dcbt T2, r0 | ||||
| dcbt T3, r0 | dcbt T3, r0 | ||||
| dcbt T4, r0 | dcbt T4, r0 | ||||
| andi. cr0, CO, 127 | |||||
| ble LDGEMM_L4x16_BEGIN_NOPRE | |||||
| mr BO, B | |||||
| srawi. L, K, 1 | |||||
| addi T1, T1, 128 | addi T1, T1, 128 | ||||
| addi T2, T2, 128 | addi T2, T2, 128 | ||||
| @@ -74,55 +79,43 @@ LDGEMM_L4x16_BEGIN: | |||||
| dcbt T3, r0 | dcbt T3, r0 | ||||
| dcbt T4, r0 | dcbt T4, r0 | ||||
| LDGEMM_L4x16_BEGIN_NOPRE: | |||||
| mr BO, B | |||||
| srawi. L, K, 2 | |||||
| ble LDGEMM_L4x16_SUB0 | ble LDGEMM_L4x16_SUB0 | ||||
| cmpwi cr0, L, 1 | cmpwi cr0, L, 1 | ||||
| ble LDGEMM_L4x16_SUB4 | ble LDGEMM_L4x16_SUB4 | ||||
| .align 5 | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP_START: | LDGEMM_L4x16_LOOP_START: | ||||
| li o40, 40 | |||||
| li o56, 56 | |||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| LOAD4x16_1 | LOAD4x16_1 | ||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| KERNEL4x16_I1 | KERNEL4x16_I1 | ||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| KERNEL4x16_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_2 | |||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| KERNEL4x16_L2 | |||||
| ble LDGEMM_L4x16_LOOP_END | ble LDGEMM_L4x16_LOOP_END | ||||
| .align 7 | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP: | LDGEMM_L4x16_LOOP: | ||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| KERNEL4x16_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_1 | |||||
| KERNEL4x16_L1 | |||||
| dcbt AO, PRE | dcbt AO, PRE | ||||
| KERNEL4x16_2 | |||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| KERNEL4x16_L2 | |||||
| bgt LDGEMM_L4x16_LOOP | bgt LDGEMM_L4x16_LOOP | ||||
| .align 5 | |||||
| .align 4 | |||||
| LDGEMM_L4x16_LOOP_END: | LDGEMM_L4x16_LOOP_END: | ||||
| dcbt AO, PRE | |||||
| KERNEL4x16_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL4x16_2 | |||||
| KERNEL4x16_1 | KERNEL4x16_1 | ||||
| KERNEL4x16_E2 | KERNEL4x16_E2 | ||||
| @@ -132,14 +125,12 @@ LDGEMM_L4x16_SUB4: | |||||
| KERNEL4x16_SUBI1 | KERNEL4x16_SUBI1 | ||||
| KERNEL4x16_SUB1 | KERNEL4x16_SUB1 | ||||
| KERNEL4x16_SUB1 | |||||
| KERNEL4x16_SUB1 | |||||
| b LDGEMM_L4x16_SUB1 | b LDGEMM_L4x16_SUB1 | ||||
| LDGEMM_L4x16_SUB0: | LDGEMM_L4x16_SUB0: | ||||
| andi. L, K, 3 | |||||
| andi. L, K, 1 | |||||
| KERNEL4x16_SUBI1 | KERNEL4x16_SUBI1 | ||||
| @@ -149,7 +140,7 @@ LDGEMM_L4x16_SUB0: | |||||
| LDGEMM_L4x16_SUB1: | LDGEMM_L4x16_SUB1: | ||||
| andi. L, K, 3 | |||||
| andi. L, K, 1 | |||||
| ble LDGEMM_L4x16_SAVE | ble LDGEMM_L4x16_SAVE | ||||
| LDGEMM_L4x16_SUB2: | LDGEMM_L4x16_SUB2: | ||||
| @@ -159,7 +150,7 @@ LDGEMM_L4x16_SUB2: | |||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| bgt LDGEMM_L4x16_SUB2 | bgt LDGEMM_L4x16_SUB2 | ||||
| .align 5 | |||||
| .align 4 | |||||
| LDGEMM_L4x16_SAVE: | LDGEMM_L4x16_SAVE: | ||||
| SAVE4x16 | SAVE4x16 | ||||
| @@ -184,15 +175,20 @@ LDGEMM_L4x8_BEGIN: | |||||
| LDGEMM_L4x8_LOOP_START: | LDGEMM_L4x8_LOOP_START: | ||||
| dcbt AO, PRE | |||||
| LOAD4x8_1 | LOAD4x8_1 | ||||
| KERNEL4x8_I1 | KERNEL4x8_I1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| @@ -203,13 +199,17 @@ LDGEMM_L4x8_LOOP_START: | |||||
| LDGEMM_L4x8_LOOP: | LDGEMM_L4x8_LOOP: | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| KERNEL4x8_1 | KERNEL4x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x8_2 | KERNEL4x8_2 | ||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| @@ -284,15 +284,18 @@ LDGEMM_L4x4_BEGIN: | |||||
| LDGEMM_L4x4_LOOP_START: | LDGEMM_L4x4_LOOP_START: | ||||
| dcbt AO, PRE | |||||
| LOAD4x4_1 | LOAD4x4_1 | ||||
| KERNEL4x4_I1 | KERNEL4x4_I1 | ||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| @@ -305,11 +308,13 @@ LDGEMM_L4x4_LOOP: | |||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| KERNEL4x4_1 | KERNEL4x4_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL4x4_2 | KERNEL4x4_2 | ||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| @@ -743,15 +748,20 @@ LDGEMM_L2x8_BEGIN: | |||||
| LDGEMM_L2x8_LOOP_START: | LDGEMM_L2x8_LOOP_START: | ||||
| dcbt AO, PRE | |||||
| LOAD2x8_1 | LOAD2x8_1 | ||||
| KERNEL2x8_I1 | KERNEL2x8_I1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| @@ -762,13 +772,17 @@ LDGEMM_L2x8_LOOP_START: | |||||
| LDGEMM_L2x8_LOOP: | LDGEMM_L2x8_LOOP: | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| KERNEL2x8_1 | KERNEL2x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | KERNEL2x8_2 | ||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| @@ -1287,15 +1301,20 @@ LDGEMM_L1x8_BEGIN: | |||||
| LDGEMM_L1x8_LOOP_START: | LDGEMM_L1x8_LOOP_START: | ||||
| dcbt AO, PRE | |||||
| LOAD1x8_1 | LOAD1x8_1 | ||||
| KERNEL1x8_I1 | KERNEL1x8_I1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| addic. L, L, -2 | addic. L, L, -2 | ||||
| @@ -1306,13 +1325,17 @@ LDGEMM_L1x8_LOOP_START: | |||||
| LDGEMM_L1x8_LOOP: | LDGEMM_L1x8_LOOP: | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| KERNEL1x8_1 | KERNEL1x8_1 | ||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | KERNEL1x8_2 | ||||
| addic. L, L, -1 | addic. L, L, -1 | ||||
| @@ -47,88 +47,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lxvdsx vs24, 0, BO | lxvdsx vs24, 0, BO | ||||
| lxvdsx vs25, o8, BO | lxvdsx vs25, o8, BO | ||||
| addi AO, AO, 64 | |||||
| lxvd2x vs4, 0, AO | |||||
| lxvd2x vs5, o16, AO | |||||
| lxvd2x vs6, o32, AO | |||||
| lxvd2x vs7, o48, AO | |||||
| lxvd2x vs4, o64, AO | |||||
| lxvd2x vs5, o80, AO | |||||
| lxvd2x vs6, o96, AO | |||||
| lxvd2x vs7, o112, AO | |||||
| lxvdsx vs26, o16, BO | lxvdsx vs26, o16, BO | ||||
| lxvdsx vs27, o24, BO | lxvdsx vs27, o24, BO | ||||
| addi AO, AO, 64 | |||||
| addi AO, AO, 128 | |||||
| addi BO, BO, 32 | addi BO, BO, 32 | ||||
| .endm | .endm | ||||
| .macro KERNEL4x16_I1 | .macro KERNEL4x16_I1 | ||||
| xvmuldp vs32, vs0, vs24 | |||||
| xvmuldp vs33, vs1, vs24 | |||||
| xvmuldp vs34, vs2, vs24 | |||||
| xvmuldp vs35, vs3, vs24 | |||||
| xvmuldp vs32, vs0, vs24 | |||||
| xvmuldp vs33, vs1, vs24 | |||||
| xvmuldp vs34, vs2, vs24 | |||||
| xvmuldp vs35, vs3, vs24 | |||||
| lxvd2x vs8, 0, AO | |||||
| lxvd2x vs8, o0, AO | |||||
| lxvd2x vs9, o16, AO | lxvd2x vs9, o16, AO | ||||
| lxvd2x vs10, o32, AO | |||||
| lxvd2x vs11, o48, AO | |||||
| xvmuldp vs36, vs4, vs24 | |||||
| xvmuldp vs37, vs5, vs24 | |||||
| xvmuldp vs38, vs6, vs24 | |||||
| xvmuldp vs39, vs7, vs24 | |||||
| xvmuldp vs36, vs4, vs24 | |||||
| xvmuldp vs37, vs5, vs24 | |||||
| xvmuldp vs38, vs6, vs24 | |||||
| xvmuldp vs39, vs7, vs24 | |||||
| lxvdsx vs28, 0, BO | lxvdsx vs28, 0, BO | ||||
| lxvdsx vs29, o8, BO | lxvdsx vs29, o8, BO | ||||
| xvmuldp vs40, vs0, vs25 | |||||
| xvmuldp vs41, vs1, vs25 | |||||
| xvmuldp vs42, vs2, vs25 | |||||
| xvmuldp vs43, vs3, vs25 | |||||
| xvmuldp vs40, vs0, vs25 | |||||
| xvmuldp vs41, vs1, vs25 | |||||
| xvmuldp vs42, vs2, vs25 | |||||
| xvmuldp vs43, vs3, vs25 | |||||
| lxvd2x vs10, o32, AO | |||||
| lxvd2x vs11, o48, AO | |||||
| xvmuldp vs44, vs4, vs25 | |||||
| xvmuldp vs45, vs5, vs25 | |||||
| xvmuldp vs46, vs6, vs25 | |||||
| xvmuldp vs47, vs7, vs25 | |||||
| xvmuldp vs44, vs4, vs25 | |||||
| xvmuldp vs45, vs5, vs25 | |||||
| xvmuldp vs46, vs6, vs25 | |||||
| xvmuldp vs47, vs7, vs25 | |||||
| addi AO, AO, 64 | |||||
| xvmuldp vs48, vs0, vs26 | |||||
| xvmuldp vs49, vs1, vs26 | |||||
| xvmuldp vs50, vs2, vs26 | |||||
| xvmuldp vs51, vs3, vs26 | |||||
| xvmuldp vs48, vs0, vs26 | |||||
| xvmuldp vs49, vs1, vs26 | |||||
| xvmuldp vs50, vs2, vs26 | |||||
| xvmuldp vs51, vs3, vs26 | |||||
| lxvd2x vs12, 0, AO | |||||
| lxvd2x vs13, o16, AO | |||||
| lxvd2x vs12, o64, AO | |||||
| lxvd2x vs13, o80, AO | |||||
| xvmuldp vs52, vs4, vs26 | |||||
| xvmuldp vs53, vs5, vs26 | |||||
| xvmuldp vs54, vs6, vs26 | |||||
| xvmuldp vs55, vs7, vs26 | |||||
| xvmuldp vs52, vs4, vs26 | |||||
| xvmuldp vs53, vs5, vs26 | |||||
| xvmuldp vs54, vs6, vs26 | |||||
| xvmuldp vs55, vs7, vs26 | |||||
| lxvd2x vs14, o32, AO | |||||
| lxvd2x vs15, o48, AO | |||||
| lxvd2x vs14, o96, AO | |||||
| lxvd2x vs15, o112, AO | |||||
| xvmuldp vs56, vs0, vs27 | |||||
| xvmuldp vs57, vs1, vs27 | |||||
| xvmuldp vs58, vs2, vs27 | |||||
| xvmuldp vs59, vs3, vs27 | |||||
| xvmuldp vs56, vs0, vs27 | |||||
| xvmuldp vs57, vs1, vs27 | |||||
| xvmuldp vs58, vs2, vs27 | |||||
| xvmuldp vs59, vs3, vs27 | |||||
| lxvdsx vs30, o16, BO | lxvdsx vs30, o16, BO | ||||
| lxvdsx vs31, o24, BO | lxvdsx vs31, o24, BO | ||||
| xvmuldp vs60, vs4, vs27 | |||||
| xvmuldp vs61, vs5, vs27 | |||||
| xvmuldp vs62, vs6, vs27 | |||||
| xvmuldp vs63, vs7, vs27 | |||||
| xvmuldp vs60, vs4, vs27 | |||||
| xvmuldp vs61, vs5, vs27 | |||||
| xvmuldp vs62, vs6, vs27 | |||||
| xvmuldp vs63, vs7, vs27 | |||||
| addi AO, AO, 64 | |||||
| addi BO, BO, 32 | |||||
| addi AO, AO, 128 | |||||
| .endm | .endm | ||||
| .macro KERNEL4x16_1 | .macro KERNEL4x16_1 | ||||
| xvmaddadp vs32, vs0, vs24 | xvmaddadp vs32, vs0, vs24 | ||||
| @@ -136,8 +136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs34, vs2, vs24 | xvmaddadp vs34, vs2, vs24 | ||||
| xvmaddadp vs35, vs3, vs24 | xvmaddadp vs35, vs3, vs24 | ||||
| lxvd2x vs8, 0, AO | |||||
| lxvd2x vs8, o0, AO | |||||
| lxvd2x vs9, o16, AO | lxvd2x vs9, o16, AO | ||||
| lxvd2x vs10, o32, AO | |||||
| lxvd2x vs11, o48, AO | |||||
| xvmaddadp vs36, vs4, vs24 | xvmaddadp vs36, vs4, vs24 | ||||
| xvmaddadp vs37, vs5, vs24 | xvmaddadp vs37, vs5, vs24 | ||||
| @@ -152,31 +154,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs42, vs2, vs25 | xvmaddadp vs42, vs2, vs25 | ||||
| xvmaddadp vs43, vs3, vs25 | xvmaddadp vs43, vs3, vs25 | ||||
| lxvd2x vs10, o32, AO | |||||
| lxvd2x vs11, o48, AO | |||||
| xvmaddadp vs44, vs4, vs25 | xvmaddadp vs44, vs4, vs25 | ||||
| xvmaddadp vs45, vs5, vs25 | xvmaddadp vs45, vs5, vs25 | ||||
| xvmaddadp vs46, vs6, vs25 | xvmaddadp vs46, vs6, vs25 | ||||
| xvmaddadp vs47, vs7, vs25 | xvmaddadp vs47, vs7, vs25 | ||||
| addi AO, AO, 64 | |||||
| xvmaddadp vs48, vs0, vs26 | xvmaddadp vs48, vs0, vs26 | ||||
| xvmaddadp vs49, vs1, vs26 | xvmaddadp vs49, vs1, vs26 | ||||
| xvmaddadp vs50, vs2, vs26 | xvmaddadp vs50, vs2, vs26 | ||||
| xvmaddadp vs51, vs3, vs26 | xvmaddadp vs51, vs3, vs26 | ||||
| lxvd2x vs12, 0, AO | |||||
| lxvd2x vs13, o16, AO | |||||
| lxvd2x vs12, o64, AO | |||||
| lxvd2x vs13, o80, AO | |||||
| xvmaddadp vs52, vs4, vs26 | xvmaddadp vs52, vs4, vs26 | ||||
| xvmaddadp vs53, vs5, vs26 | xvmaddadp vs53, vs5, vs26 | ||||
| xvmaddadp vs54, vs6, vs26 | xvmaddadp vs54, vs6, vs26 | ||||
| xvmaddadp vs55, vs7, vs26 | xvmaddadp vs55, vs7, vs26 | ||||
| lxvd2x vs14, o32, AO | |||||
| lxvd2x vs15, o48, AO | |||||
| lxvd2x vs14, o96, AO | |||||
| lxvd2x vs15, o112, AO | |||||
| xvmaddadp vs56, vs0, vs27 | xvmaddadp vs56, vs0, vs27 | ||||
| xvmaddadp vs57, vs1, vs27 | xvmaddadp vs57, vs1, vs27 | ||||
| @@ -192,7 +191,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs62, vs6, vs27 | xvmaddadp vs62, vs6, vs27 | ||||
| xvmaddadp vs63, vs7, vs27 | xvmaddadp vs63, vs7, vs27 | ||||
| addi AO, AO, 64 | |||||
| addi AO, AO, 128 | |||||
| addi BO, BO, 32 | addi BO, BO, 32 | ||||
| .endm | .endm | ||||
| @@ -228,23 +227,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs46, vs14, vs29 | xvmaddadp vs46, vs14, vs29 | ||||
| xvmaddadp vs47, vs15, vs29 | xvmaddadp vs47, vs15, vs29 | ||||
| addi AO, AO, 64 | |||||
| xvmaddadp vs48, vs8, vs30 | xvmaddadp vs48, vs8, vs30 | ||||
| xvmaddadp vs49, vs9, vs30 | xvmaddadp vs49, vs9, vs30 | ||||
| xvmaddadp vs50, vs10, vs30 | xvmaddadp vs50, vs10, vs30 | ||||
| xvmaddadp vs51, vs11, vs30 | xvmaddadp vs51, vs11, vs30 | ||||
| lxvd2x vs4, 0, AO | |||||
| lxvd2x vs5, o16, AO | |||||
| lxvd2x vs4, o64, AO | |||||
| lxvd2x vs5, o80, AO | |||||
| xvmaddadp vs52, vs12, vs30 | xvmaddadp vs52, vs12, vs30 | ||||
| xvmaddadp vs53, vs13, vs30 | xvmaddadp vs53, vs13, vs30 | ||||
| xvmaddadp vs54, vs14, vs30 | xvmaddadp vs54, vs14, vs30 | ||||
| xvmaddadp vs55, vs15, vs30 | xvmaddadp vs55, vs15, vs30 | ||||
| lxvd2x vs6, o32, AO | |||||
| lxvd2x vs7, o48, AO | |||||
| lxvd2x vs6, o96, AO | |||||
| lxvd2x vs7, o112, AO | |||||
| xvmaddadp vs56, vs8, vs31 | xvmaddadp vs56, vs8, vs31 | ||||
| xvmaddadp vs57, vs9, vs31 | xvmaddadp vs57, vs9, vs31 | ||||
| @@ -259,11 +257,144 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs62, vs14, vs31 | xvmaddadp vs62, vs14, vs31 | ||||
| xvmaddadp vs63, vs15, vs31 | xvmaddadp vs63, vs15, vs31 | ||||
| addi AO, AO, 64 | |||||
| addi AO, AO, 128 | |||||
| addi BO, BO, 32 | addi BO, BO, 32 | ||||
| .endm | .endm | ||||
/*
 * KERNEL4x16_L1 -- first half of the two-step 4x16 inner-loop body.
 *
 * Accumulates into vs32..vs63 using the operands that are already live:
 *   vs32..vs39 += vs0..vs7 * vs24
 *   vs40..vs47 += vs0..vs7 * vs25
 *   vs48..vs55 += vs0..vs7 * vs26
 *   vs56..vs63 += vs0..vs7 * vs27
 * While those multiply-adds issue, it loads the operands for the second
 * half: vs8..vs15 from AO (byte offsets 0..112) and vs28..vs31 from BO
 * (byte offsets 0, 8, 16, 24; each lxvdsx splats one doubleword).
 *
 * On exit AO has been advanced by 128 bytes. BO is NOT advanced here;
 * KERNEL4x16_L2 reads BO at offsets 32..56 and then advances BO by 64,
 * which accounts for the 32 bytes consumed by this macro.
 *
 * The oNN operands are index registers expected to hold the constant NN.
 * NOTE(review): o0 is not defined in the visible part of the diff --
 * presumably it expands to literal 0; confirm in the including kernel file.
 */
| .macro KERNEL4x16_L1 | |||||
| xvmaddadp vs32, vs0, vs24 | |||||
| xvmaddadp vs33, vs1, vs24 | |||||
| xvmaddadp vs34, vs2, vs24 | |||||
| xvmaddadp vs35, vs3, vs24 | |||||
/* Load the next A sub-panel, first 64 bytes -> vs8..vs11. */
| lxvd2x vs8, o0, AO | |||||
| lxvd2x vs9, o16, AO | |||||
| lxvd2x vs10, o32, AO | |||||
| lxvd2x vs11, o48, AO | |||||
| xvmaddadp vs36, vs4, vs24 | |||||
| xvmaddadp vs37, vs5, vs24 | |||||
| xvmaddadp vs38, vs6, vs24 | |||||
| xvmaddadp vs39, vs7, vs24 | |||||
/* Splat-load the next two B values -> vs28, vs29. */
| lxvdsx vs28, 0, BO | |||||
| lxvdsx vs29, o8, BO | |||||
| xvmaddadp vs40, vs0, vs25 | |||||
| xvmaddadp vs41, vs1, vs25 | |||||
| xvmaddadp vs42, vs2, vs25 | |||||
| xvmaddadp vs43, vs3, vs25 | |||||
| xvmaddadp vs44, vs4, vs25 | |||||
| xvmaddadp vs45, vs5, vs25 | |||||
| xvmaddadp vs46, vs6, vs25 | |||||
| xvmaddadp vs47, vs7, vs25 | |||||
| xvmaddadp vs48, vs0, vs26 | |||||
| xvmaddadp vs49, vs1, vs26 | |||||
| xvmaddadp vs50, vs2, vs26 | |||||
| xvmaddadp vs51, vs3, vs26 | |||||
/* Load the next A sub-panel, second 64 bytes -> vs12..vs15. */
| lxvd2x vs12, o64, AO | |||||
| lxvd2x vs13, o80, AO | |||||
| xvmaddadp vs52, vs4, vs26 | |||||
| xvmaddadp vs53, vs5, vs26 | |||||
| xvmaddadp vs54, vs6, vs26 | |||||
| xvmaddadp vs55, vs7, vs26 | |||||
| lxvd2x vs14, o96, AO | |||||
| lxvd2x vs15, o112, AO | |||||
| xvmaddadp vs56, vs0, vs27 | |||||
| xvmaddadp vs57, vs1, vs27 | |||||
| xvmaddadp vs58, vs2, vs27 | |||||
| xvmaddadp vs59, vs3, vs27 | |||||
/* Splat-load the remaining two B values -> vs30, vs31. */
| lxvdsx vs30, o16, BO | |||||
| lxvdsx vs31, o24, BO | |||||
| xvmaddadp vs60, vs4, vs27 | |||||
| xvmaddadp vs61, vs5, vs27 | |||||
| xvmaddadp vs62, vs6, vs27 | |||||
| xvmaddadp vs63, vs7, vs27 | |||||
/* Only AO moves; BO is advanced by the matching KERNEL4x16_L2. */
| addi AO, AO, 128 | |||||
| .endm | |||||
/*
 * KERNEL4x16_L2 -- second half of the two-step 4x16 inner-loop body.
 *
 * Accumulates into vs32..vs63 using the operands loaded by the first half:
 *   vs32..vs39 += vs8..vs15 * vs28
 *   vs40..vs47 += vs8..vs15 * vs29
 *   vs48..vs55 += vs8..vs15 * vs30
 *   vs56..vs63 += vs8..vs15 * vs31
 * While those multiply-adds issue, it reloads the first-half operands:
 * vs0..vs7 from AO (byte offsets 0..112) and vs24..vs27 from BO at byte
 * offsets 32, 40, 48, 56.
 *
 * The BO offsets start at 32 because the preceding first-half macro read
 * BO at offsets 0..24 without advancing it. On exit AO has been advanced
 * by 128 bytes and BO by 64 bytes.
 *
 * Requirements on the caller: the index registers o16..o112 must hold
 * their constants. o40 and o56 are #defined onto r12 and r11, the same
 * registers as T4 and T3, so they must be (re)loaded with 40 and 56
 * before this macro runs if T3/T4 were used in between.
 */
| .macro KERNEL4x16_L2 | |||||
| xvmaddadp vs32, vs8, vs28 | |||||
| xvmaddadp vs33, vs9, vs28 | |||||
| xvmaddadp vs34, vs10, vs28 | |||||
| xvmaddadp vs35, vs11, vs28 | |||||
/* Reload A for the next first-half step, starting with vs0, vs1. */
| lxvd2x vs0, 0, AO | |||||
| lxvd2x vs1, o16, AO | |||||
| xvmaddadp vs36, vs12, vs28 | |||||
| xvmaddadp vs37, vs13, vs28 | |||||
| xvmaddadp vs38, vs14, vs28 | |||||
| xvmaddadp vs39, vs15, vs28 | |||||
/* Splat-load B values from the second 32 bytes at BO -> vs24, vs25. */
| lxvdsx vs24, o32, BO | |||||
| lxvdsx vs25, o40, BO | |||||
| xvmaddadp vs40, vs8, vs29 | |||||
| xvmaddadp vs41, vs9, vs29 | |||||
| xvmaddadp vs42, vs10, vs29 | |||||
| xvmaddadp vs43, vs11, vs29 | |||||
| lxvd2x vs2, o32, AO | |||||
| lxvd2x vs3, o48, AO | |||||
| xvmaddadp vs44, vs12, vs29 | |||||
| xvmaddadp vs45, vs13, vs29 | |||||
| xvmaddadp vs46, vs14, vs29 | |||||
| xvmaddadp vs47, vs15, vs29 | |||||
| xvmaddadp vs48, vs8, vs30 | |||||
| xvmaddadp vs49, vs9, vs30 | |||||
| xvmaddadp vs50, vs10, vs30 | |||||
| xvmaddadp vs51, vs11, vs30 | |||||
| lxvd2x vs4, o64, AO | |||||
| lxvd2x vs5, o80, AO | |||||
| xvmaddadp vs52, vs12, vs30 | |||||
| xvmaddadp vs53, vs13, vs30 | |||||
| xvmaddadp vs54, vs14, vs30 | |||||
| xvmaddadp vs55, vs15, vs30 | |||||
| lxvd2x vs6, o96, AO | |||||
| lxvd2x vs7, o112, AO | |||||
| xvmaddadp vs56, vs8, vs31 | |||||
| xvmaddadp vs57, vs9, vs31 | |||||
| xvmaddadp vs58, vs10, vs31 | |||||
| xvmaddadp vs59, vs11, vs31 | |||||
| lxvdsx vs26, o48, BO | |||||
| lxvdsx vs27, o56, BO | |||||
/* Pointer bumps are placed between the last multiply-adds; every AO/BO load above has already been issued. */
| xvmaddadp vs60, vs12, vs31 | |||||
| addi AO, AO, 128 | |||||
| xvmaddadp vs61, vs13, vs31 | |||||
| xvmaddadp vs62, vs14, vs31 | |||||
| addi BO, BO, 64 | |||||
| xvmaddadp vs63, vs15, vs31 | |||||
| .endm | |||||
| .macro KERNEL4x16_E2 | .macro KERNEL4x16_E2 | ||||
| @@ -378,15 +509,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| lxvdsx vs26, o16, BO | lxvdsx vs26, o16, BO | ||||
| lxvdsx vs27, o24, BO | lxvdsx vs27, o24, BO | ||||
| addi AO, AO, 64 | |||||
| addi BO, BO, 32 | |||||
| lxvd2x vs4, 0, AO | |||||
| lxvd2x vs5, o16, AO | |||||
| lxvd2x vs6, o32, AO | |||||
| lxvd2x vs7, o48, AO | |||||
| lxvd2x vs4, o64, AO | |||||
| lxvd2x vs5, o80, AO | |||||
| lxvd2x vs6, o96, AO | |||||
| lxvd2x vs7, o112, AO | |||||
| addi AO, AO, 64 | |||||
| xvmaddadp vs32, vs0, vs24 | xvmaddadp vs32, vs0, vs24 | ||||
| @@ -402,6 +530,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs41, vs1, vs25 | xvmaddadp vs41, vs1, vs25 | ||||
| xvmaddadp vs42, vs2, vs25 | xvmaddadp vs42, vs2, vs25 | ||||
| xvmaddadp vs43, vs3, vs25 | xvmaddadp vs43, vs3, vs25 | ||||
| addi BO, BO, 32 | |||||
| xvmaddadp vs44, vs4, vs25 | xvmaddadp vs44, vs4, vs25 | ||||
| xvmaddadp vs45, vs5, vs25 | xvmaddadp vs45, vs5, vs25 | ||||
| xvmaddadp vs46, vs6, vs25 | xvmaddadp vs46, vs6, vs25 | ||||
| @@ -411,6 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs49, vs1, vs26 | xvmaddadp vs49, vs1, vs26 | ||||
| xvmaddadp vs50, vs2, vs26 | xvmaddadp vs50, vs2, vs26 | ||||
| xvmaddadp vs51, vs3, vs26 | xvmaddadp vs51, vs3, vs26 | ||||
| addi AO, AO, 128 | |||||
| xvmaddadp vs52, vs4, vs26 | xvmaddadp vs52, vs4, vs26 | ||||
| xvmaddadp vs53, vs5, vs26 | xvmaddadp vs53, vs5, vs26 | ||||
| xvmaddadp vs54, vs6, vs26 | xvmaddadp vs54, vs6, vs26 | ||||
| @@ -430,33 +560,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x16 | .macro SAVE4x16 | ||||
| mr T1, CO | mr T1, CO | ||||
| addi T2, T1, 64 | |||||
| add T3, T1, LDC | |||||
| addi T4, T3, 64 | |||||
| #ifndef TRMMKERNEL | |||||
| lxvd2x vs0, 0, T1 | |||||
| lxvd2x vs1, o16, T1 | |||||
| lxvd2x vs2, o32, T1 | |||||
| lxvd2x vs3, o48, T1 | |||||
| lxvd2x vs4, 0, T2 | |||||
| lxvd2x vs5, o16, T2 | |||||
| lxvd2x vs6, o32, T2 | |||||
| lxvd2x vs7, o48, T2 | |||||
| lxvd2x vs8, 0, T3 | |||||
| lxvd2x vs9, o16, T3 | |||||
| lxvd2x vs10, o32, T3 | |||||
| lxvd2x vs11, o48, T3 | |||||
| lxvd2x vs12, 0, T4 | |||||
| lxvd2x vs13, o16, T4 | |||||
| lxvd2x vs14, o32, T4 | |||||
| lxvd2x vs15, o48, T4 | |||||
| #endif | |||||
| add T2, T1, LDC | |||||
| add T3, T2, LDC | |||||
| add T4, T3, LDC | |||||
| lxvd2x vs0, 0, CO | |||||
| lxvd2x vs1, o16, CO | |||||
| lxvd2x vs2, o32, CO | |||||
| lxvd2x vs3, o48, CO | |||||
| lxvd2x vs4, o64, CO | |||||
| lxvd2x vs5, o80, CO | |||||
| lxvd2x vs6, o96, CO | |||||
| lxvd2x vs7, o112, CO | |||||
| lxvd2x vs8, 0, T2 | |||||
| lxvd2x vs9, o16, T2 | |||||
| lxvd2x vs10, o32, T2 | |||||
| lxvd2x vs11, o48, T2 | |||||
| lxvd2x vs12, o64, T2 | |||||
| lxvd2x vs13, o80, T2 | |||||
| lxvd2x vs14, o96, T2 | |||||
| lxvd2x vs15, o112, T2 | |||||
| lxvd2x vs24, 0, T3 | |||||
| lxvd2x vs25, o16, T3 | |||||
| lxvd2x vs26, o32, T3 | |||||
| lxvd2x vs27, o48, T3 | |||||
| lxvd2x vs28, o64, T3 | |||||
| lxvd2x vs29, o80, T3 | |||||
| lxvd2x vs30, o96, T3 | |||||
| lxvd2x vs31, o112, T3 | |||||
| #ifndef TRMMKERNEL | |||||
| xvmaddadp vs0, vs32, alpha_r | xvmaddadp vs0, vs32, alpha_r | ||||
| xvmaddadp vs1, vs33, alpha_r | xvmaddadp vs1, vs33, alpha_r | ||||
| xvmaddadp vs2, vs34, alpha_r | xvmaddadp vs2, vs34, alpha_r | ||||
| @@ -465,138 +599,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvmaddadp vs5, vs37, alpha_r | xvmaddadp vs5, vs37, alpha_r | ||||
| xvmaddadp vs6, vs38, alpha_r | xvmaddadp vs6, vs38, alpha_r | ||||
| xvmaddadp vs7, vs39, alpha_r | xvmaddadp vs7, vs39, alpha_r | ||||
| lxvd2x vs32, 0, T4 | |||||
| lxvd2x vs33, o16, T4 | |||||
| lxvd2x vs34, o32, T4 | |||||
| lxvd2x vs35, o48, T4 | |||||
| lxvd2x vs36, o64, T4 | |||||
| lxvd2x vs37, o80, T4 | |||||
| lxvd2x vs38, o96, T4 | |||||
| lxvd2x vs39, o112, T4 | |||||
| xvmaddadp vs8, vs40, alpha_r | xvmaddadp vs8, vs40, alpha_r | ||||
| xvmaddadp vs9, vs41, alpha_r | xvmaddadp vs9, vs41, alpha_r | ||||
| xvmaddadp vs10, vs42, alpha_r | xvmaddadp vs10, vs42, alpha_r | ||||
| xvmaddadp vs11, vs43, alpha_r | xvmaddadp vs11, vs43, alpha_r | ||||
| xvmaddadp vs12, vs44, alpha_r | |||||
| xvmaddadp vs13, vs45, alpha_r | |||||
| xvmaddadp vs14, vs46, alpha_r | |||||
| xvmaddadp vs15, vs47, alpha_r | |||||
| #else | |||||
| xvmuldp vs0, vs32, alpha_r | |||||
| xvmuldp vs1, vs33, alpha_r | |||||
| xvmuldp vs2, vs34, alpha_r | |||||
| xvmuldp vs3, vs35, alpha_r | |||||
| xvmuldp vs4, vs36, alpha_r | |||||
| xvmuldp vs5, vs37, alpha_r | |||||
| xvmuldp vs6, vs38, alpha_r | |||||
| xvmuldp vs7, vs39, alpha_r | |||||
| xvmuldp vs8, vs40, alpha_r | |||||
| xvmuldp vs9, vs41, alpha_r | |||||
| xvmuldp vs10, vs42, alpha_r | |||||
| xvmuldp vs11, vs43, alpha_r | |||||
| xvmuldp vs12, vs44, alpha_r | |||||
| xvmuldp vs13, vs45, alpha_r | |||||
| xvmuldp vs14, vs46, alpha_r | |||||
| xvmuldp vs15, vs47, alpha_r | |||||
| #endif | |||||
| stxvd2x vs0, 0, T1 | stxvd2x vs0, 0, T1 | ||||
| stxvd2x vs1, o16, T1 | stxvd2x vs1, o16, T1 | ||||
| stxvd2x vs2, o32, T1 | stxvd2x vs2, o32, T1 | ||||
| stxvd2x vs3, o48, T1 | stxvd2x vs3, o48, T1 | ||||
| stxvd2x vs4, 0, T2 | |||||
| stxvd2x vs5, o16, T2 | |||||
| stxvd2x vs6, o32, T2 | |||||
| stxvd2x vs7, o48, T2 | |||||
| stxvd2x vs8, 0, T3 | |||||
| stxvd2x vs9, o16, T3 | |||||
| stxvd2x vs10, o32, T3 | |||||
| stxvd2x vs11, o48, T3 | |||||
| stxvd2x vs12, 0, T4 | |||||
| stxvd2x vs13, o16, T4 | |||||
| stxvd2x vs14, o32, T4 | |||||
| stxvd2x vs15, o48, T4 | |||||
| slwi T4, LDC, 1 | |||||
| add T1, T1, T4 | |||||
| add T3, T3, T4 | |||||
| addi T2, T1, 64 | |||||
| addi T4, T3, 64 | |||||
| #ifndef TRMMKERNEL | |||||
| lxvd2x vs0, 0, T1 | |||||
| lxvd2x vs1, o16, T1 | |||||
| lxvd2x vs2, o32, T1 | |||||
| lxvd2x vs3, o48, T1 | |||||
| lxvd2x vs4, 0, T2 | |||||
| lxvd2x vs5, o16, T2 | |||||
| lxvd2x vs6, o32, T2 | |||||
| lxvd2x vs7, o48, T2 | |||||
| lxvd2x vs8, 0, T3 | |||||
| lxvd2x vs9, o16, T3 | |||||
| lxvd2x vs10, o32, T3 | |||||
| lxvd2x vs11, o48, T3 | |||||
| lxvd2x vs12, 0, T4 | |||||
| lxvd2x vs13, o16, T4 | |||||
| lxvd2x vs14, o32, T4 | |||||
| lxvd2x vs15, o48, T4 | |||||
| #endif | |||||
| #ifndef TRMMKERNEL | |||||
| xvmaddadp vs0, vs48, alpha_r | |||||
| xvmaddadp vs1, vs49, alpha_r | |||||
| xvmaddadp vs2, vs50, alpha_r | |||||
| xvmaddadp vs3, vs51, alpha_r | |||||
| xvmaddadp vs4, vs52, alpha_r | |||||
| xvmaddadp vs5, vs53, alpha_r | |||||
| xvmaddadp vs6, vs54, alpha_r | |||||
| xvmaddadp vs7, vs55, alpha_r | |||||
| xvmaddadp vs8, vs56, alpha_r | |||||
| xvmaddadp vs9, vs57, alpha_r | |||||
| xvmaddadp vs10, vs58, alpha_r | |||||
| xvmaddadp vs11, vs59, alpha_r | |||||
| xvmaddadp vs12, vs60, alpha_r | |||||
| xvmaddadp vs13, vs61, alpha_r | |||||
| xvmaddadp vs14, vs62, alpha_r | |||||
| xvmaddadp vs15, vs63, alpha_r | |||||
| #else | |||||
| xvmuldp vs0, vs48, alpha_r | |||||
| xvmuldp vs1, vs49, alpha_r | |||||
| xvmuldp vs2, vs50, alpha_r | |||||
| xvmuldp vs3, vs51, alpha_r | |||||
| xvmuldp vs4, vs52, alpha_r | |||||
| xvmuldp vs5, vs53, alpha_r | |||||
| xvmuldp vs6, vs54, alpha_r | |||||
| xvmuldp vs7, vs55, alpha_r | |||||
| xvmuldp vs8, vs56, alpha_r | |||||
| xvmuldp vs9, vs57, alpha_r | |||||
| xvmuldp vs10, vs58, alpha_r | |||||
| xvmuldp vs11, vs59, alpha_r | |||||
| xvmuldp vs12, vs60, alpha_r | |||||
| xvmuldp vs13, vs61, alpha_r | |||||
| xvmuldp vs14, vs62, alpha_r | |||||
| xvmuldp vs15, vs63, alpha_r | |||||
| #endif | |||||
| stxvd2x vs0, 0, T1 | |||||
| stxvd2x vs1, o16, T1 | |||||
| stxvd2x vs2, o32, T1 | |||||
| stxvd2x vs3, o48, T1 | |||||
| xvmaddadp vs12, vs44, alpha_r | |||||
| xvmaddadp vs13, vs45, alpha_r | |||||
| xvmaddadp vs14, vs46, alpha_r | |||||
| xvmaddadp vs15, vs47, alpha_r | |||||
| stxvd2x vs4, 0, T2 | |||||
| stxvd2x vs5, o16, T2 | |||||
| stxvd2x vs6, o32, T2 | |||||
| stxvd2x vs7, o48, T2 | |||||
| stxvd2x vs4, o64, T1 | |||||
| stxvd2x vs5, o80, T1 | |||||
| stxvd2x vs6, o96, T1 | |||||
| stxvd2x vs7, o112, T1 | |||||
| xvmaddadp vs24, vs48, alpha_r | |||||
| xvmaddadp vs25, vs49, alpha_r | |||||
| xvmaddadp vs26, vs50, alpha_r | |||||
| xvmaddadp vs27, vs51, alpha_r | |||||
| stxvd2x vs8, o0, T2 | |||||
| stxvd2x vs9, o16, T2 | |||||
| stxvd2x vs10, o32, T2 | |||||
| stxvd2x vs11, o48, T2 | |||||
| xvmaddadp vs28, vs52, alpha_r | |||||
| xvmaddadp vs29, vs53, alpha_r | |||||
| xvmaddadp vs30, vs54, alpha_r | |||||
| xvmaddadp vs31, vs55, alpha_r | |||||
| stxvd2x vs12, o64, T2 | |||||
| stxvd2x vs13, o80, T2 | |||||
| stxvd2x vs14, o96, T2 | |||||
| stxvd2x vs15, o112, T2 | |||||
| xvmaddadp vs32, vs56, alpha_r | |||||
| xvmaddadp vs33, vs57, alpha_r | |||||
| xvmaddadp vs34, vs58, alpha_r | |||||
| xvmaddadp vs35, vs59, alpha_r | |||||
| stxvd2x vs24, 0, T3 | |||||
| stxvd2x vs25, o16, T3 | |||||
| stxvd2x vs26, o32, T3 | |||||
| stxvd2x vs27, o48, T3 | |||||
| xvmaddadp vs36, vs60, alpha_r | |||||
| xvmaddadp vs37, vs61, alpha_r | |||||
| xvmaddadp vs38, vs62, alpha_r | |||||
| xvmaddadp vs39, vs63, alpha_r | |||||
| stxvd2x vs28, o64, T3 | |||||
| stxvd2x vs29, o80, T3 | |||||
| stxvd2x vs30, o96, T3 | |||||
| stxvd2x vs31, o112, T3 | |||||
| stxvd2x vs32, o0, T4 | |||||
| stxvd2x vs33, o16, T4 | |||||
| stxvd2x vs34, o32, T4 | |||||
| stxvd2x vs35, o48, T4 | |||||
| stxvd2x vs8, 0, T3 | |||||
| stxvd2x vs9, o16, T3 | |||||
| stxvd2x vs10, o32, T3 | |||||
| stxvd2x vs11, o48, T3 | |||||
| addi CO, CO, 128 | |||||
| stxvd2x vs12, 0, T4 | |||||
| stxvd2x vs13, o16, T4 | |||||
| stxvd2x vs14, o32, T4 | |||||
| stxvd2x vs15, o48, T4 | |||||
| stxvd2x vs36, o64, T4 | |||||
| stxvd2x vs37, o80, T4 | |||||
| stxvd2x vs38, o96, T4 | |||||
| stxvd2x vs39, o112, T4 | |||||
| addi CO, CO, 128 | |||||
| .endm | .endm | ||||
| @@ -0,0 +1,228 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| /* Register map for the DGEMM "ncopy 4" packing routine. */ | |||||
| /* The names below are consumed by the included macro file and by the */ | |||||
| /* copy-loop logic that the entry point includes. */ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| /* Incoming values in r3..r7: M and N are tested against zero on entry, */ | |||||
| /* A is the source cursor stepped by LDA per column, B is copied to BO. */ | |||||
| /* NOTE(review): presumably the C-level arguments (m, n, a, lda, b) in */ | |||||
| /* ABI order - confirm against the caller's prototype. */ | |||||
| #define M r3 | |||||
| #define N r4 | |||||
| #define A r5 | |||||
| #define LDA r6 | |||||
| #define B r7 | |||||
| /* Per-column source pointers: A0 = A, A1 = A0 + LDA, and so on. */ | |||||
| #define A0 r8 | |||||
| #define A1 r9 | |||||
| #define A2 r10 | |||||
| #define A3 r11 | |||||
| /* Inner counter / scratch for the row-count tests on M. */ | |||||
| #define J r12 | |||||
| /* PREA is the dcbt prefetch distance; PREB is loaded by the entry point */ | |||||
| /* but not referenced by the copy logic or macros visible here. */ | |||||
| #define PREA r14 | |||||
| #define PREB r15 | |||||
| /* Output cursor, advanced by every COPY_* macro. */ | |||||
| #define BO r16 | |||||
| /* Registers holding constant byte offsets for indexed VSX load/store. */ | |||||
| #define o64 r17 | |||||
| #define o80 r18 | |||||
| #define o96 r19 | |||||
| #define o112 r20 | |||||
| /* o8 is initialised to 8 by the entry point; T2 is not referenced by */ | |||||
| /* the copy logic or macros visible here. */ | |||||
| #define o8 r21 | |||||
| #define T2 r22 | |||||
| /* Outer counter: number of remaining 4-column groups. */ | |||||
| #define I r23 | |||||
| #define o16 r24 | |||||
| #define o32 r25 | |||||
| #define o48 r26 | |||||
| /* NOTU1 / NOTU2 are not referenced by the visible code. */ | |||||
| #define NOTU1 r27 | |||||
| #define NOTU2 r30 | |||||
| /* Scratch destination for the andi. tests on the column count. */ | |||||
| #define T1 r31 | |||||
| /* Literal 0 for the RA slot of indexed loads/stores: RA = 0 means "no */ | |||||
| /* base register", not the contents of r0. */ | |||||
| #define o0 0 | |||||
| #include "dgemm_ncopy_macros_4_power8.S" | |||||
| /* Frame layout (bytes from SP after the frame is opened): */ | |||||
| /* 0..143 f14-f31 (overlaid by vs14-vs31, which the macros load) */ | |||||
| /* 144..287 r31-r14 */ | |||||
| /* 288..479 vs52-vs63 (overlay VMX v20-v31, callee-saved) */ | |||||
| #define STACKSIZE 480 | |||||
| /* Entry point of the DGEMM ncopy-4 packing routine. */ | |||||
| /* Saves every callee-saved register the copy macros may overwrite, */ | |||||
| /* returns immediately when M <= 0 or N <= 0, otherwise sets up the */ | |||||
| /* offset registers and runs the included copy logic. Returns 0 in r3. */ | |||||
| /* NOTE(review): cmpwi / slwi (and srawi in the logic) act on the low */ | |||||
| /* 32 bits only - confirm M, N and LDA << BASE_SHIFT always fit. */ | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE | |||||
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| /* Save vs52-vs63: the COPY_4x16 macro writes them and they overlay the */ | |||||
| /* callee-saved vector registers v20-v31. r0 sits in the RB slot, so */ | |||||
| /* its value (the frame offset) is added to SP. */ | |||||
| li r0, 288 | |||||
| stxvd2x vs52, SP, r0 | |||||
| li r0, 304 | |||||
| stxvd2x vs53, SP, r0 | |||||
| li r0, 320 | |||||
| stxvd2x vs54, SP, r0 | |||||
| li r0, 336 | |||||
| stxvd2x vs55, SP, r0 | |||||
| li r0, 352 | |||||
| stxvd2x vs56, SP, r0 | |||||
| li r0, 368 | |||||
| stxvd2x vs57, SP, r0 | |||||
| li r0, 384 | |||||
| stxvd2x vs58, SP, r0 | |||||
| li r0, 400 | |||||
| stxvd2x vs59, SP, r0 | |||||
| li r0, 416 | |||||
| stxvd2x vs60, SP, r0 | |||||
| li r0, 432 | |||||
| stxvd2x vs61, SP, r0 | |||||
| li r0, 448 | |||||
| stxvd2x vs62, SP, r0 | |||||
| li r0, 464 | |||||
| stxvd2x vs63, SP, r0 | |||||
| /* Leave r0 = 0, as the routine did before the vector saves were added. */ | |||||
| li r0, 0 | |||||
| /* Nothing to copy when either dimension is <= 0. */ | |||||
| cmpwi cr0, M, 0 | |||||
| ble- L999 | |||||
| cmpwi cr0, N, 0 | |||||
| ble- L999 | |||||
| /* Scale LDA by BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT */ | |||||
| /* comes from common.h). */ | |||||
| slwi LDA, LDA, BASE_SHIFT | |||||
| li PREA, 384 | |||||
| li PREB, 384 | |||||
| /* Constant byte offsets used as index registers by the COPY_* macros. */ | |||||
| li o8, 8 | |||||
| li o16, 16 | |||||
| li o32, 32 | |||||
| li o48, 48 | |||||
| li o64, 64 | |||||
| li o80, 80 | |||||
| li o96, 96 | |||||
| li o112, 112 | |||||
| #include "dgemm_ncopy_logic_4_power8.S" | |||||
| L999: | |||||
| /* Common exit: return 0 and restore everything saved above. */ | |||||
| li r3, 0 | |||||
| li r0, 288 | |||||
| lxvd2x vs52, SP, r0 | |||||
| li r0, 304 | |||||
| lxvd2x vs53, SP, r0 | |||||
| li r0, 320 | |||||
| lxvd2x vs54, SP, r0 | |||||
| li r0, 336 | |||||
| lxvd2x vs55, SP, r0 | |||||
| li r0, 352 | |||||
| lxvd2x vs56, SP, r0 | |||||
| li r0, 368 | |||||
| lxvd2x vs57, SP, r0 | |||||
| li r0, 384 | |||||
| lxvd2x vs58, SP, r0 | |||||
| li r0, 400 | |||||
| lxvd2x vs59, SP, r0 | |||||
| li r0, 416 | |||||
| lxvd2x vs60, SP, r0 | |||||
| li r0, 432 | |||||
| lxvd2x vs61, SP, r0 | |||||
| li r0, 448 | |||||
| lxvd2x vs62, SP, r0 | |||||
| li r0, 464 | |||||
| lxvd2x vs63, SP, r0 | |||||
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,237 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /* Copy-loop driver. Walks the source in groups of 4 columns, then a */ | |||||
| /* 2-column and a 1-column remainder. Within each group the rows are */ | |||||
| /* consumed 16 at a time, then in 8/4/2/1 remainders chosen by the low */ | |||||
| /* bits of M. Output is written sequentially through BO. */ | |||||
| /* andi. with a small positive mask never yields a negative result, so */ | |||||
| /* each "ble" after an andi. branches exactly when the tested bit is 0. */ | |||||
| mr BO, B | |||||
| /* I = N / 4: number of full 4-column groups. */ | |||||
| srawi. I, N, 2 | |||||
| ble DCOPYN_L2_BEGIN | |||||
| DCOPYN_L4_BEGIN: | |||||
| DCOPYN_L4_LOOP: | |||||
| /* A0..A3 = the four current columns; A moves on to the next group. */ | |||||
| mr A0, A | |||||
| add A1, A0, LDA | |||||
| add A2, A1, LDA | |||||
| add A3, A2, LDA | |||||
| add A, A3, LDA | |||||
| DCOPYN_L4x16_BEGIN: | |||||
| /* J = M / 16: number of full 16-row chunks. */ | |||||
| srawi. J, M, 4 | |||||
| ble DCOPYN_L4x16_END | |||||
| DCOPYN_L4x16_LOOP: | |||||
| /* Prefetch PREA bytes ahead in each column before copying the chunk. */ | |||||
| dcbt A0, PREA | |||||
| dcbt A1, PREA | |||||
| dcbt A2, PREA | |||||
| dcbt A3, PREA | |||||
| COPY_4x16 | |||||
| addic. J, J, -1 | |||||
| bgt DCOPYN_L4x16_LOOP | |||||
| DCOPYN_L4x16_END: | |||||
| DCOPYN_L4x8_BEGIN: | |||||
| andi. J, M, 8 | |||||
| ble DCOPYN_L4x8_END | |||||
| COPY_4x8 | |||||
| DCOPYN_L4x8_END: | |||||
| DCOPYN_L4x4_BEGIN: | |||||
| andi. J, M, 4 | |||||
| ble DCOPYN_L4x4_END | |||||
| COPY_4x4 | |||||
| DCOPYN_L4x4_END: | |||||
| DCOPYN_L4x2_BEGIN: | |||||
| andi. J, M, 2 | |||||
| ble DCOPYN_L4x2_END | |||||
| COPY_4x2 | |||||
| DCOPYN_L4x2_END: | |||||
| DCOPYN_L4x1_BEGIN: | |||||
| andi. J, M, 1 | |||||
| ble DCOPYN_L4x1_END | |||||
| COPY_4x1 | |||||
| DCOPYN_L4x1_END: | |||||
| DCOPYN_L4_END: | |||||
| addic. I, I, -1 | |||||
| bgt DCOPYN_L4_LOOP | |||||
| DCOPYN_L2_BEGIN: | |||||
| /* Two leftover columns when bit 1 of N is set. The column count is */ | |||||
| /* named as N here rather than by its bare register number. */ | |||||
| andi. T1, N, 2 | |||||
| ble DCOPYN_L2_END | |||||
| /* Despite the label name this section runs at most once. */ | |||||
| DCOPYN_L2_LOOP: | |||||
| mr A0, A | |||||
| add A1, A0, LDA | |||||
| add A, A1, LDA | |||||
| DCOPYN_L2x16_BEGIN: | |||||
| srawi. J, M, 4 | |||||
| ble DCOPYN_L2x16_END | |||||
| DCOPYN_L2x16_LOOP: | |||||
| COPY_2x16 | |||||
| addic. J, J, -1 | |||||
| bgt DCOPYN_L2x16_LOOP | |||||
| DCOPYN_L2x16_END: | |||||
| DCOPYN_L2x8_BEGIN: | |||||
| andi. J, M, 8 | |||||
| ble DCOPYN_L2x8_END | |||||
| COPY_2x8 | |||||
| DCOPYN_L2x8_END: | |||||
| DCOPYN_L2x4_BEGIN: | |||||
| andi. J, M, 4 | |||||
| ble DCOPYN_L2x4_END | |||||
| COPY_2x4 | |||||
| DCOPYN_L2x4_END: | |||||
| DCOPYN_L2x2_BEGIN: | |||||
| andi. J, M, 2 | |||||
| ble DCOPYN_L2x2_END | |||||
| COPY_2x2 | |||||
| DCOPYN_L2x2_END: | |||||
| DCOPYN_L2x1_BEGIN: | |||||
| andi. J, M, 1 | |||||
| ble DCOPYN_L2x1_END | |||||
| COPY_2x1 | |||||
| DCOPYN_L2x1_END: | |||||
| DCOPYN_L2_END: | |||||
| DCOPYN_L1_BEGIN: | |||||
| /* One leftover column when bit 0 of N is set. */ | |||||
| andi. T1, N, 1 | |||||
| ble DCOPYN_L1_END | |||||
| /* Runs at most once, like the 2-column section. */ | |||||
| DCOPYN_L1_LOOP: | |||||
| mr A0, A | |||||
| add A, A0, LDA | |||||
| DCOPYN_L1x16_BEGIN: | |||||
| srawi. J, M, 4 | |||||
| ble DCOPYN_L1x16_END | |||||
| DCOPYN_L1x16_LOOP: | |||||
| COPY_1x16 | |||||
| addic. J, J, -1 | |||||
| bgt DCOPYN_L1x16_LOOP | |||||
| DCOPYN_L1x16_END: | |||||
| DCOPYN_L1x8_BEGIN: | |||||
| andi. J, M, 8 | |||||
| ble DCOPYN_L1x8_END | |||||
| COPY_1x8 | |||||
| DCOPYN_L1x8_END: | |||||
| DCOPYN_L1x4_BEGIN: | |||||
| andi. J, M, 4 | |||||
| ble DCOPYN_L1x4_END | |||||
| COPY_1x4 | |||||
| DCOPYN_L1x4_END: | |||||
| DCOPYN_L1x2_BEGIN: | |||||
| andi. J, M, 2 | |||||
| ble DCOPYN_L1x2_END | |||||
| COPY_1x2 | |||||
| DCOPYN_L1x2_END: | |||||
| DCOPYN_L1x1_BEGIN: | |||||
| andi. J, M, 1 | |||||
| ble DCOPYN_L1x1_END | |||||
| COPY_1x1 | |||||
| DCOPYN_L1x1_END: | |||||
| DCOPYN_L1_END: | |||||
| @@ -0,0 +1,691 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/04/28 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /********************************************************************************************** | |||||
| * Macros for N=4 and M=16 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_4x16 | |||||
| /* Interleave 16 doubles from each of the four columns A0..A3 into BO. */ | |||||
| /* For every row the output holds a0, a1, a2, a3 consecutively. */ | |||||
| /* Reads 128 bytes per column, writes 512 bytes; advances A0..A3 by 128 */ | |||||
| /* and BO by 512. Overwrites vs0-vs63. */ | |||||
| /* Load phase: vs0-7 <- A0, vs8-15 <- A1, vs16-23 <- A2, vs24-31 <- A3. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs24, o0, A3 | |||||
| lxvd2x vs16, o0, A2 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| lxvd2x vs17, o16, A2 | |||||
| lxvd2x vs25, o16, A3 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs10, o32, A1 | |||||
| lxvd2x vs18, o32, A2 | |||||
| lxvd2x vs26, o32, A3 | |||||
| lxvd2x vs3, o48, A0 | |||||
| lxvd2x vs11, o48, A1 | |||||
| lxvd2x vs19, o48, A2 | |||||
| lxvd2x vs27, o48, A3 | |||||
| lxvd2x vs4, o64, A0 | |||||
| lxvd2x vs12, o64, A1 | |||||
| lxvd2x vs20, o64, A2 | |||||
| lxvd2x vs28, o64, A3 | |||||
| lxvd2x vs5, o80, A0 | |||||
| lxvd2x vs13, o80, A1 | |||||
| lxvd2x vs21, o80, A2 | |||||
| lxvd2x vs29, o80, A3 | |||||
| lxvd2x vs6, o96, A0 | |||||
| lxvd2x vs14, o96, A1 | |||||
| lxvd2x vs22, o96, A2 | |||||
| lxvd2x vs30, o96, A3 | |||||
| lxvd2x vs7, o112, A0 | |||||
| lxvd2x vs15, o112, A1 | |||||
| lxvd2x vs23, o112, A2 | |||||
| lxvd2x vs31, o112, A3 | |||||
| /* Permute phase: selector 0 pairs doubleword 0 of both sources (even */ | |||||
| /* row), selector 3 pairs doubleword 1 (odd row). */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs33, vs16, vs24, 0 | |||||
| xxpermdi vs34, vs0, vs8, 3 | |||||
| xxpermdi vs35, vs16, vs24, 3 | |||||
| xxpermdi vs36, vs1, vs9, 0 | |||||
| xxpermdi vs37, vs17, vs25, 0 | |||||
| xxpermdi vs38, vs1, vs9, 3 | |||||
| xxpermdi vs39, vs17, vs25, 3 | |||||
| xxpermdi vs40, vs2, vs10, 0 | |||||
| xxpermdi vs41, vs18, vs26, 0 | |||||
| xxpermdi vs42, vs2, vs10, 3 | |||||
| xxpermdi vs43, vs18, vs26, 3 | |||||
| xxpermdi vs44, vs3, vs11, 0 | |||||
| xxpermdi vs45, vs19, vs27, 0 | |||||
| xxpermdi vs46, vs3, vs11, 3 | |||||
| xxpermdi vs47, vs19, vs27, 3 | |||||
| xxpermdi vs48, vs4, vs12, 0 | |||||
| xxpermdi vs49, vs20, vs28, 0 | |||||
| xxpermdi vs50, vs4, vs12, 3 | |||||
| xxpermdi vs51, vs20, vs28, 3 | |||||
| xxpermdi vs52, vs5, vs13, 0 | |||||
| xxpermdi vs53, vs21, vs29, 0 | |||||
| xxpermdi vs54, vs5, vs13, 3 | |||||
| xxpermdi vs55, vs21, vs29, 3 | |||||
| /* Source pointer bumps are interleaved with the remaining permutes. */ | |||||
| addi A0, A0, 128 | |||||
| addi A1, A1, 128 | |||||
| xxpermdi vs56, vs6, vs14, 0 | |||||
| xxpermdi vs57, vs22, vs30, 0 | |||||
| xxpermdi vs58, vs6, vs14, 3 | |||||
| xxpermdi vs59, vs22, vs30, 3 | |||||
| addi A3, A3, 128 | |||||
| addi A2, A2, 128 | |||||
| xxpermdi vs60, vs7, vs15, 0 | |||||
| xxpermdi vs61, vs23, vs31, 0 | |||||
| xxpermdi vs62, vs7, vs15, 3 | |||||
| xxpermdi vs63, vs23, vs31, 3 | |||||
| /* Store phase: four 128-byte groups, BO bumped after each group. */ | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| stxvd2x vs36, o64, BO | |||||
| stxvd2x vs37, o80, BO | |||||
| stxvd2x vs38, o96, BO | |||||
| stxvd2x vs39, o112, BO | |||||
| addi BO, BO, 128 | |||||
| stxvd2x vs40, o0, BO | |||||
| stxvd2x vs41, o16, BO | |||||
| stxvd2x vs42, o32, BO | |||||
| stxvd2x vs43, o48, BO | |||||
| stxvd2x vs44, o64, BO | |||||
| stxvd2x vs45, o80, BO | |||||
| stxvd2x vs46, o96, BO | |||||
| stxvd2x vs47, o112, BO | |||||
| addi BO, BO, 128 | |||||
| stxvd2x vs48, o0, BO | |||||
| stxvd2x vs49, o16, BO | |||||
| stxvd2x vs50, o32, BO | |||||
| stxvd2x vs51, o48, BO | |||||
| stxvd2x vs52, o64, BO | |||||
| stxvd2x vs53, o80, BO | |||||
| stxvd2x vs54, o96, BO | |||||
| stxvd2x vs55, o112, BO | |||||
| addi BO, BO, 128 | |||||
| stxvd2x vs56, o0, BO | |||||
| stxvd2x vs57, o16, BO | |||||
| stxvd2x vs58, o32, BO | |||||
| stxvd2x vs59, o48, BO | |||||
| stxvd2x vs60, o64, BO | |||||
| stxvd2x vs61, o80, BO | |||||
| stxvd2x vs62, o96, BO | |||||
| stxvd2x vs63, o112, BO | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=4 and M=8 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_4x8 | |||||
| /* Interleave 8 doubles from each of A0..A3 into BO (a0,a1,a2,a3 per */ | |||||
| /* row). Advances A0..A3 by 64 and BO by 256. */ | |||||
| /* Overwrites vs0-3, vs8-11, vs16-19, vs24-27 and vs32-47. */ | |||||
| /* Loads, walked offset by offset across the four columns. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs16, o0, A2 | |||||
| lxvd2x vs24, o0, A3 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| lxvd2x vs17, o16, A2 | |||||
| lxvd2x vs25, o16, A3 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs10, o32, A1 | |||||
| lxvd2x vs18, o32, A2 | |||||
| lxvd2x vs26, o32, A3 | |||||
| lxvd2x vs3, o48, A0 | |||||
| lxvd2x vs11, o48, A1 | |||||
| lxvd2x vs19, o48, A2 | |||||
| lxvd2x vs27, o48, A3 | |||||
| addi A0, A0, 64 | |||||
| addi A1, A1, 64 | |||||
| addi A2, A2, 64 | |||||
| addi A3, A3, 64 | |||||
| /* Rows 0-3: build and emit the first 128 output bytes. */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs34, vs0, vs8, 3 | |||||
| xxpermdi vs36, vs1, vs9, 0 | |||||
| xxpermdi vs38, vs1, vs9, 3 | |||||
| xxpermdi vs33, vs16, vs24, 0 | |||||
| xxpermdi vs35, vs16, vs24, 3 | |||||
| xxpermdi vs37, vs17, vs25, 0 | |||||
| xxpermdi vs39, vs17, vs25, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| stxvd2x vs36, o64, BO | |||||
| stxvd2x vs37, o80, BO | |||||
| stxvd2x vs38, o96, BO | |||||
| stxvd2x vs39, o112, BO | |||||
| addi BO, BO, 128 | |||||
| /* Rows 4-7: second 128 output bytes. */ | |||||
| xxpermdi vs40, vs2, vs10, 0 | |||||
| xxpermdi vs42, vs2, vs10, 3 | |||||
| xxpermdi vs44, vs3, vs11, 0 | |||||
| xxpermdi vs46, vs3, vs11, 3 | |||||
| xxpermdi vs41, vs18, vs26, 0 | |||||
| xxpermdi vs43, vs18, vs26, 3 | |||||
| xxpermdi vs45, vs19, vs27, 0 | |||||
| xxpermdi vs47, vs19, vs27, 3 | |||||
| stxvd2x vs40, o0, BO | |||||
| stxvd2x vs41, o16, BO | |||||
| stxvd2x vs42, o32, BO | |||||
| stxvd2x vs43, o48, BO | |||||
| stxvd2x vs44, o64, BO | |||||
| stxvd2x vs45, o80, BO | |||||
| stxvd2x vs46, o96, BO | |||||
| stxvd2x vs47, o112, BO | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=4 and M=4 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_4x4 | |||||
| /* Interleave 4 doubles from each of A0..A3 into BO (a0,a1,a2,a3 per */ | |||||
| /* row). Advances A0..A3 by 32 and BO by 128. */ | |||||
| /* Overwrites vs0-1, vs8-9, vs16-17, vs24-25 and vs32-39. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs16, o0, A2 | |||||
| lxvd2x vs24, o0, A3 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| lxvd2x vs17, o16, A2 | |||||
| lxvd2x vs25, o16, A3 | |||||
| addi A0, A0, 32 | |||||
| addi A1, A1, 32 | |||||
| addi A2, A2, 32 | |||||
| addi A3, A3, 32 | |||||
| /* Columns 0/1 first, then columns 2/3; selector 0 = even row, 3 = odd. */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs34, vs0, vs8, 3 | |||||
| xxpermdi vs36, vs1, vs9, 0 | |||||
| xxpermdi vs38, vs1, vs9, 3 | |||||
| xxpermdi vs33, vs16, vs24, 0 | |||||
| xxpermdi vs35, vs16, vs24, 3 | |||||
| xxpermdi vs37, vs17, vs25, 0 | |||||
| xxpermdi vs39, vs17, vs25, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| stxvd2x vs36, o64, BO | |||||
| stxvd2x vs37, o80, BO | |||||
| stxvd2x vs38, o96, BO | |||||
| stxvd2x vs39, o112, BO | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=4 and M=2 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_4x2 | |||||
| /* Interleave 2 doubles from each of A0..A3 into BO. */ | |||||
| /* Advances A0..A3 by 16 and BO by 64. */ | |||||
| /* Overwrites vs0, vs8, vs16, vs24 and vs32-35. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs16, o0, A2 | |||||
| lxvd2x vs24, o0, A3 | |||||
| addi A0, A0, 16 | |||||
| addi A1, A1, 16 | |||||
| addi A2, A2, 16 | |||||
| addi A3, A3, 16 | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs34, vs0, vs8, 3 | |||||
| xxpermdi vs33, vs16, vs24, 0 | |||||
| xxpermdi vs35, vs16, vs24, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| addi BO, BO, 64 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=4 and M=1 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_4x1 | |||||
| /* Gather one double from each of A0..A3 and write the four of them */ | |||||
| /* consecutively to BO. Advances A0..A3 by 8 and BO by 32. */ | |||||
| /* Overwrites vs0, vs8, vs16, vs24, vs32 and vs33. */ | |||||
| lxsdx vs0, o0, A0 | |||||
| lxsdx vs8, o0, A1 | |||||
| lxsdx vs16, o0, A2 | |||||
| lxsdx vs24, o0, A3 | |||||
| addi A0, A0, 8 | |||||
| addi A1, A1, 8 | |||||
| addi A2, A2, 8 | |||||
| addi A3, A3, 8 | |||||
| /* The scalar loads fill doubleword 0, which selector 0 picks up. */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs33, vs16, vs24, 0 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| addi BO, BO, 32 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=2 and M=16 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_2x16 | |||||
| /* Interleave 16 doubles from each of A0 and A1 into BO (a0,a1 per row). */ | |||||
| /* Advances A0 and A1 by 128 and BO by 256. */ | |||||
| /* Overwrites vs0-15 and vs32-47. */ | |||||
| /* Loads, alternating between the two columns. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs10, o32, A1 | |||||
| lxvd2x vs3, o48, A0 | |||||
| lxvd2x vs11, o48, A1 | |||||
| lxvd2x vs4, o64, A0 | |||||
| lxvd2x vs12, o64, A1 | |||||
| lxvd2x vs5, o80, A0 | |||||
| lxvd2x vs13, o80, A1 | |||||
| lxvd2x vs6, o96, A0 | |||||
| lxvd2x vs14, o96, A1 | |||||
| lxvd2x vs7, o112, A0 | |||||
| lxvd2x vs15, o112, A1 | |||||
| addi A0, A0, 128 | |||||
| addi A1, A1, 128 | |||||
| /* Rows 0-7: first 128 output bytes. */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs33, vs0, vs8, 3 | |||||
| xxpermdi vs34, vs1, vs9, 0 | |||||
| xxpermdi vs35, vs1, vs9, 3 | |||||
| xxpermdi vs36, vs2, vs10, 0 | |||||
| xxpermdi vs37, vs2, vs10, 3 | |||||
| xxpermdi vs38, vs3, vs11, 0 | |||||
| xxpermdi vs39, vs3, vs11, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| stxvd2x vs36, o64, BO | |||||
| stxvd2x vs37, o80, BO | |||||
| stxvd2x vs38, o96, BO | |||||
| stxvd2x vs39, o112, BO | |||||
| addi BO, BO, 128 | |||||
| /* Rows 8-15: second 128 output bytes. */ | |||||
| xxpermdi vs40, vs4, vs12, 0 | |||||
| xxpermdi vs41, vs4, vs12, 3 | |||||
| xxpermdi vs42, vs5, vs13, 0 | |||||
| xxpermdi vs43, vs5, vs13, 3 | |||||
| xxpermdi vs44, vs6, vs14, 0 | |||||
| xxpermdi vs45, vs6, vs14, 3 | |||||
| xxpermdi vs46, vs7, vs15, 0 | |||||
| xxpermdi vs47, vs7, vs15, 3 | |||||
| stxvd2x vs40, o0, BO | |||||
| stxvd2x vs41, o16, BO | |||||
| stxvd2x vs42, o32, BO | |||||
| stxvd2x vs43, o48, BO | |||||
| stxvd2x vs44, o64, BO | |||||
| stxvd2x vs45, o80, BO | |||||
| stxvd2x vs46, o96, BO | |||||
| stxvd2x vs47, o112, BO | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=2 and M=8 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_2x8 | |||||
| /* Interleave 8 doubles from each of A0 and A1 into BO (a0,a1 per row). */ | |||||
| /* Advances A0 and A1 by 64 and BO by 128. */ | |||||
| /* Overwrites vs0-3, vs8-11 and vs32-39. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs10, o32, A1 | |||||
| lxvd2x vs3, o48, A0 | |||||
| lxvd2x vs11, o48, A1 | |||||
| addi A0, A0, 64 | |||||
| addi A1, A1, 64 | |||||
| /* Even rows (selector 0) first, then odd rows (selector 3). */ | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs34, vs1, vs9, 0 | |||||
| xxpermdi vs36, vs2, vs10, 0 | |||||
| xxpermdi vs38, vs3, vs11, 0 | |||||
| xxpermdi vs33, vs0, vs8, 3 | |||||
| xxpermdi vs35, vs1, vs9, 3 | |||||
| xxpermdi vs37, vs2, vs10, 3 | |||||
| xxpermdi vs39, vs3, vs11, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| stxvd2x vs36, o64, BO | |||||
| stxvd2x vs37, o80, BO | |||||
| stxvd2x vs38, o96, BO | |||||
| stxvd2x vs39, o112, BO | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=2 and M=4 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_2x4 | |||||
| /* Interleave 4 doubles from each of A0 and A1 into BO. */ | |||||
| /* Advances A0 and A1 by 32 and BO by 64. */ | |||||
| /* Overwrites vs0-1, vs8-9 and vs32-35. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs9, o16, A1 | |||||
| addi A0, A0, 32 | |||||
| addi A1, A1, 32 | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs34, vs1, vs9, 0 | |||||
| xxpermdi vs33, vs0, vs8, 3 | |||||
| xxpermdi vs35, vs1, vs9, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| stxvd2x vs34, o32, BO | |||||
| stxvd2x vs35, o48, BO | |||||
| addi BO, BO, 64 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=2 and M=2 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_2x2 | |||||
| /* Interleave 2 doubles from each of A0 and A1 into BO. */ | |||||
| /* Advances A0 and A1 by 16 and BO by 32. */ | |||||
| /* Overwrites vs0, vs8, vs32 and vs33. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs8, o0, A1 | |||||
| addi A0, A0, 16 | |||||
| addi A1, A1, 16 | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| xxpermdi vs33, vs0, vs8, 3 | |||||
| stxvd2x vs32, o0, BO | |||||
| stxvd2x vs33, o16, BO | |||||
| addi BO, BO, 32 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=2 and M=1 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_2x1 | |||||
| /* Write one double from A0 followed by one from A1 to BO. */ | |||||
| /* Advances A0 and A1 by 8 and BO by 16. Overwrites vs0, vs8 and vs32. */ | |||||
| lxsdx vs0, o0, A0 | |||||
| lxsdx vs8, o0, A1 | |||||
| addi A0, A0, 8 | |||||
| addi A1, A1, 8 | |||||
| xxpermdi vs32, vs0, vs8, 0 | |||||
| stxvd2x vs32, o0, BO | |||||
| addi BO, BO, 16 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=1 and M=16 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_1x16 | |||||
| /* Straight copy of 16 doubles (128 bytes) from A0 to BO. */ | |||||
| /* Advances both A0 and BO by 128. Overwrites vs0-vs7. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs3, o48, A0 | |||||
| lxvd2x vs4, o64, A0 | |||||
| lxvd2x vs5, o80, A0 | |||||
| lxvd2x vs6, o96, A0 | |||||
| lxvd2x vs7, o112, A0 | |||||
| /* All eight stores use the offset registers, so BO is bumped once. */ | |||||
| stxvd2x vs0, o0, BO | |||||
| stxvd2x vs1, o16, BO | |||||
| stxvd2x vs2, o32, BO | |||||
| stxvd2x vs3, o48, BO | |||||
| stxvd2x vs4, o64, BO | |||||
| stxvd2x vs5, o80, BO | |||||
| stxvd2x vs6, o96, BO | |||||
| stxvd2x vs7, o112, BO | |||||
| addi A0, A0, 128 | |||||
| addi BO, BO, 128 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=1 and M=8 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_1x8 | |||||
| /* Straight copy of 8 doubles (64 bytes) from A0 to BO. */ | |||||
| /* Advances both A0 and BO by 64. Overwrites vs0-vs3. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs1, o16, A0 | |||||
| lxvd2x vs2, o32, A0 | |||||
| lxvd2x vs3, o48, A0 | |||||
| stxvd2x vs0, o0, BO | |||||
| stxvd2x vs1, o16, BO | |||||
| stxvd2x vs2, o32, BO | |||||
| stxvd2x vs3, o48, BO | |||||
| /* Both cursors move together once the data is out. */ | |||||
| addi A0, A0, 64 | |||||
| addi BO, BO, 64 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=1 and M=4 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_1x4 | |||||
| /* Straight copy of 4 doubles (32 bytes) from A0 to BO. */ | |||||
| /* Advances both A0 and BO by 32. Overwrites vs0 and vs1. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| lxvd2x vs1, o16, A0 | |||||
| stxvd2x vs0, o0, BO | |||||
| stxvd2x vs1, o16, BO | |||||
| addi A0, A0, 32 | |||||
| addi BO, BO, 32 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=1 and M=2 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_1x2 | |||||
| /* Straight copy of 2 doubles (16 bytes) from A0 to BO. */ | |||||
| /* Advances both A0 and BO by 16. Overwrites vs0. */ | |||||
| lxvd2x vs0, o0, A0 | |||||
| stxvd2x vs0, o0, BO | |||||
| addi A0, A0, 16 | |||||
| addi BO, BO, 16 | |||||
| .endm | |||||
| /********************************************************************************************** | |||||
| * Macros for N=1 and M=1 | |||||
| **********************************************************************************************/ | |||||
| .macro COPY_1x1 | |||||
| /* Copy a single double (8 bytes) from A0 to BO with scalar VSX */ | |||||
| /* load/store. Advances both A0 and BO by 8. Overwrites vs0. */ | |||||
| lxsdx vs0, o0, A0 | |||||
| stxsdx vs0, o0, BO | |||||
| addi A0, A0, 8 | |||||
| addi BO, BO, 8 | |||||
| .endm | |||||