|
-
- #if defined(_AIX)
- define(`INIT_16x4', `
- #else
- .macro INIT_16x4
- #endif
- // INIT_16x4: seed the 32 accumulators vs32..vs63 (the xvmaddadp targets of KERNEL_16x4) from a zeroed vs0.
- // Takes no arguments. Writes vs0 and vs32..vs63; reads no memory.
- xxlxor vs0, vs0, vs0 // vs0 = vs0 XOR vs0, i.e. all bits zero
- // XVMOVDP is defined outside this section; presumably a plain register copy (dst, src) - TODO confirm.
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
- XVMOVDP(vs40,vs0)
- XVMOVDP(vs41,vs0)
- XVMOVDP(vs42,vs0)
- XVMOVDP(vs43,vs0)
- XVMOVDP(vs44,vs0)
- XVMOVDP(vs45,vs0)
- XVMOVDP(vs46,vs0)
- XVMOVDP(vs47,vs0)
- XVMOVDP(vs48,vs0)
- XVMOVDP(vs49,vs0)
- XVMOVDP(vs50,vs0)
- XVMOVDP(vs51,vs0)
- XVMOVDP(vs52,vs0)
- XVMOVDP(vs53,vs0)
- XVMOVDP(vs54,vs0)
- XVMOVDP(vs55,vs0)
- XVMOVDP(vs56,vs0)
- XVMOVDP(vs57,vs0)
- XVMOVDP(vs58,vs0)
- XVMOVDP(vs59,vs0)
- XVMOVDP(vs60,vs0)
- XVMOVDP(vs61,vs0)
- XVMOVDP(vs62,vs0)
- XVMOVDP(vs63,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`KERNEL_16x4', `
- #else
- .macro KERNEL_16x4
- #endif
- // KERNEL_16x4: one accumulation step of a 16x4 tile. Loads 8 vectors (2 doubles each) through AO and 4 splatted doubles through BO, adds every product into vs32..vs63.
- // Side effects: AO += 128, BO += 32, vs0..vs7 and vs16..vs19 overwritten. o0..o48 are index registers set up outside this section; their names suggest byte offsets - TODO confirm.
- lxvd2x vs0, o0, AO // vs0 = two doubles at AO + o0
- // lxvdsx loads one double and copies it into both lanes of the target.
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
- lxvdsx vs18, o16, BO
- lxvdsx vs19, o24, BO
-
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- addi BO, BO, 32 // BO advanced past the 4 values just read
- addi AO, AO, 64 // first half of the A data consumed
-
- lxvd2x vs4, o0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64 // second half of the A data consumed
- // Accumulator layout: vs(32 + 4*i + j) += vs(i) * vs(16 + j), lane-wise, for i = 0..7 and j = 0..3.
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18
- xvmaddadp vs35, vs0, vs19
- xvmaddadp vs36, vs1, vs16
- xvmaddadp vs37, vs1, vs17
- xvmaddadp vs38, vs1, vs18
- xvmaddadp vs39, vs1, vs19
- xvmaddadp vs40, vs2, vs16
- xvmaddadp vs41, vs2, vs17
- xvmaddadp vs42, vs2, vs18
- xvmaddadp vs43, vs2, vs19
- xvmaddadp vs44, vs3, vs16
- xvmaddadp vs45, vs3, vs17
- xvmaddadp vs46, vs3, vs18
- xvmaddadp vs47, vs3, vs19
- xvmaddadp vs48, vs4, vs16
- xvmaddadp vs49, vs4, vs17
- xvmaddadp vs50, vs4, vs18
- xvmaddadp vs51, vs4, vs19
- xvmaddadp vs52, vs5, vs16
- xvmaddadp vs53, vs5, vs17
- xvmaddadp vs54, vs5, vs18
- xvmaddadp vs55, vs5, vs19
- xvmaddadp vs56, vs6, vs16
- xvmaddadp vs57, vs6, vs17
- xvmaddadp vs58, vs6, vs18
- xvmaddadp vs59, vs6, vs19
- xvmaddadp vs60, vs7, vs16
- xvmaddadp vs61, vs7, vs17
- xvmaddadp vs62, vs7, vs18
- xvmaddadp vs63, vs7, vs19
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`INIT_8x4', `
- #else
- .macro INIT_8x4
- #endif
- // INIT_8x4: seed the 16 accumulators vs32..vs47 (the xvmaddadp targets of KERNEL_8x4) from a zeroed vs0.
- // Takes no arguments. Writes vs0 and vs32..vs47; reads no memory.
- xxlxor vs0, vs0, vs0 // vs0 = vs0 XOR vs0, i.e. all bits zero
- // XVMOVDP is defined outside this section; presumably a plain register copy (dst, src) - TODO confirm.
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
- XVMOVDP(vs40,vs0)
- XVMOVDP(vs41,vs0)
- XVMOVDP(vs42,vs0)
- XVMOVDP(vs43,vs0)
- XVMOVDP(vs44,vs0)
- XVMOVDP(vs45,vs0)
- XVMOVDP(vs46,vs0)
- XVMOVDP(vs47,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`KERNEL_8x4', `
- #else
- .macro KERNEL_8x4
- #endif
- // KERNEL_8x4: one accumulation step of an 8x4 tile. Loads 4 vectors (2 doubles each) through AO and 4 splatted doubles through BO, adds every product into vs32..vs47.
- // Side effects: AO += 64, BO += 32, vs0..vs3 and vs16..vs19 overwritten. o0..o48 are index registers set up outside this section; their names suggest byte offsets - TODO confirm.
- lxvd2x vs0, o0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- addi AO, AO, 64
- // lxvdsx loads one double and copies it into both lanes of the target.
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
- lxvdsx vs18, o16, BO
- lxvdsx vs19, o24, BO
-
- addi BO, BO, 32
- // Accumulator layout: vs(32 + 4*i + j) += vs(i) * vs(16 + j), lane-wise, for i = 0..3 and j = 0..3.
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18
- xvmaddadp vs35, vs0, vs19
- xvmaddadp vs36, vs1, vs16
- xvmaddadp vs37, vs1, vs17
- xvmaddadp vs38, vs1, vs18
- xvmaddadp vs39, vs1, vs19
- xvmaddadp vs40, vs2, vs16
- xvmaddadp vs41, vs2, vs17
- xvmaddadp vs42, vs2, vs18
- xvmaddadp vs43, vs2, vs19
- xvmaddadp vs44, vs3, vs16
- xvmaddadp vs45, vs3, vs17
- xvmaddadp vs46, vs3, vs18
- xvmaddadp vs47, vs3, vs19
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`INIT_4x4', `
- #else
- .macro INIT_4x4
- #endif
- // INIT_4x4: seed the 8 accumulators vs32..vs39 (the xvmaddadp targets of KERNEL_4x4) from a zeroed vs0.
- // Takes no arguments. Writes vs0 and vs32..vs39; reads no memory.
- xxlxor vs0, vs0, vs0 // vs0 = vs0 XOR vs0, i.e. all bits zero
- // XVMOVDP is defined outside this section; presumably a plain register copy (dst, src) - TODO confirm.
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`KERNEL_4x4', `
- #else
- .macro KERNEL_4x4
- #endif
- // KERNEL_4x4: one accumulation step of a 4x4 tile. Loads 2 vectors (2 doubles each) through AO and 4 splatted doubles through BO, adds every product into vs32..vs39.
- // Side effects: AO += 32, BO += 32, vs0, vs1 and vs16..vs19 overwritten. o0..o24 are index registers set up outside this section; their names suggest byte offsets - TODO confirm.
- lxvd2x vs0, o0, AO
- lxvd2x vs1, o16, AO
-
- addi AO, AO, 32
- // lxvdsx loads one double and copies it into both lanes of the target.
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
- lxvdsx vs18, o16, BO
- lxvdsx vs19, o24, BO
-
- addi BO, BO, 32
- // Accumulator layout: vs(32 + 4*i + j) += vs(i) * vs(16 + j), lane-wise, for i = 0..1 and j = 0..3.
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18
- xvmaddadp vs35, vs0, vs19
- xvmaddadp vs36, vs1, vs16
- xvmaddadp vs37, vs1, vs17
- xvmaddadp vs38, vs1, vs18
- xvmaddadp vs39, vs1, vs19
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`INIT_2x4', `
- #else
- .macro INIT_2x4
- #endif
- // INIT_2x4: seed the 4 accumulators vs32..vs35 (the xvmaddadp targets of KERNEL_2x4) from a zeroed vs0.
- // Takes no arguments. Writes vs0 and vs32..vs35; reads no memory.
- xxlxor vs0, vs0, vs0 // vs0 = vs0 XOR vs0, i.e. all bits zero
- // XVMOVDP is defined outside this section; presumably a plain register copy (dst, src) - TODO confirm.
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`KERNEL_2x4', `
- #else
- .macro KERNEL_2x4
- #endif
- // KERNEL_2x4: one accumulation step of a 2x4 tile. Loads 1 vector (2 doubles) through AO and 4 splatted doubles through BO, adds the products into vs32..vs35.
- // Side effects: AO += 16, BO += 32, vs0 and vs16..vs19 overwritten. o0..o24 are index registers set up outside this section; their names suggest byte offsets - TODO confirm.
- lxvd2x vs0, o0, AO
-
- addi AO, AO, 16
- // lxvdsx loads one double and copies it into both lanes of the target.
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
- lxvdsx vs18, o16, BO
- lxvdsx vs19, o24, BO
-
- addi BO, BO, 32
- // Accumulator layout: vs(32 + j) += vs0 * vs(16 + j), lane-wise, for j = 0..3.
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18
- xvmaddadp vs35, vs0, vs19
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`INIT_1x4', `
- #else
- .macro INIT_1x4
- #endif
- // INIT_1x4: seed the 4 accumulators vs32..vs35 (the xvmaddadp targets of KERNEL_1x4) from a zeroed vs0.
- // Takes no arguments. Writes vs0 and vs32..vs35; reads no memory.
- xxlxor vs0, vs0, vs0 // vs0 = vs0 XOR vs0, i.e. all bits zero
- // XVMOVDP is defined outside this section; presumably a plain register copy (dst, src) - TODO confirm.
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- #if defined(_AIX)
- define(`KERNEL_1x4', `
- #else
- .macro KERNEL_1x4
- #endif
- // KERNEL_1x4: one accumulation step of a 1x4 tile. Loads a single double through AO and 4 doubles through BO, all splatted, and adds the products into vs32..vs35.
- // Side effects: AO += 8, BO += 32, vs0 and vs16..vs19 overwritten. o0..o24 are index registers set up outside this section; their names suggest byte offsets - TODO confirm.
- lxvdsx vs0, o0, AO // splat load: both lanes of vs0 hold the same A value
-
- addi AO, AO, 8
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
- lxvdsx vs18, o16, BO
- lxvdsx vs19, o24, BO
-
- addi BO, BO, 32
- // vs(32 + j) += vs0 * vs(16 + j) for j = 0..3; every operand is a splat, so both lanes of each accumulator carry the same sum.
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs0, vs18
- xvmaddadp vs35, vs0, vs19
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 16x4
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_16x4', `
- #else
- .macro SOLVE_LT_16x4
- #endif
- // SOLVE_LT_16x4: finish a 16x4 tile. (1) Transpose the accumulators vs32..vs63 into vs0..vs31. (2) Load 32 vectors through BO and subtract the transposed accumulators. (3) Forward-substitute with coefficients splat-loaded through AO. (4) Store the result back through BO and to four LDC-strided columns at CO.
- //############### LOAD B #######################
- // Writes T1, T2, T4 and vs0..vs63; AO, BO, CO and LDC are only read. o0..o48, PRE and SIZE come from outside this section (byte offsets, prefetch distance and element size by the look of it - TODO confirm).
- mr T1, BO // T1 = read cursor over the B data
- mr T4, BO // T4 = write cursor for the solved rows stored back to the same buffer
- // xxpermdi selector 0 pairs doubleword 0 of both sources, selector 3 pairs doubleword 1: vs(2r) and vs(2r+1) become row r of the accumulated product (columns 0..1 and 2..3).
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs34, vs35, 0
- xxpermdi vs2, vs32, vs33, 3
- xxpermdi vs3, vs34, vs35, 3
- // vs32..vs35 were consumed just above, so they are reused right away for the incoming B data; the same pattern repeats for each group of four registers.
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs4, vs36, vs37, 0
- xxpermdi vs5, vs38, vs39, 0
- xxpermdi vs6, vs36, vs37, 3
- xxpermdi vs7, vs38, vs39, 3
-
- lxvd2x vs36, o0, T1
- lxvd2x vs37, o16, T1
- lxvd2x vs38, o32, T1
- lxvd2x vs39, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs8, vs40, vs41, 0
- xxpermdi vs9, vs42, vs43, 0
- xxpermdi vs10, vs40, vs41, 3
- xxpermdi vs11, vs42, vs43, 3
-
- lxvd2x vs40, o0, T1
- lxvd2x vs41, o16, T1
- lxvd2x vs42, o32, T1
- lxvd2x vs43, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs12, vs44, vs45, 0
- xxpermdi vs13, vs46, vs47, 0
- xxpermdi vs14, vs44, vs45, 3
- xxpermdi vs15, vs46, vs47, 3
-
- lxvd2x vs44, o0, T1
- lxvd2x vs45, o16, T1
- lxvd2x vs46, o32, T1
- lxvd2x vs47, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs16, vs48, vs49, 0
- xxpermdi vs17, vs50, vs51, 0
- xxpermdi vs18, vs48, vs49, 3
- xxpermdi vs19, vs50, vs51, 3
-
- lxvd2x vs48, o0, T1
- lxvd2x vs49, o16, T1
- lxvd2x vs50, o32, T1
- lxvd2x vs51, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs20, vs52, vs53, 0
- xxpermdi vs21, vs54, vs55, 0
- xxpermdi vs22, vs52, vs53, 3
- xxpermdi vs23, vs54, vs55, 3
-
- lxvd2x vs52, o0, T1
- lxvd2x vs53, o16, T1
- lxvd2x vs54, o32, T1
- lxvd2x vs55, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs24, vs56, vs57, 0
- xxpermdi vs25, vs58, vs59, 0
- xxpermdi vs26, vs56, vs57, 3
- xxpermdi vs27, vs58, vs59, 3
-
- lxvd2x vs56, o0, T1
- lxvd2x vs57, o16, T1
- lxvd2x vs58, o32, T1
- lxvd2x vs59, o48, T1
-
- addi T1, T1, 64
-
- xxpermdi vs28, vs60, vs61, 0
- xxpermdi vs29, vs62, vs63, 0
- xxpermdi vs30, vs60, vs61, 3
- xxpermdi vs31, vs62, vs63, 3
-
-
-
- lxvd2x vs60, o0, T1
- lxvd2x vs61, o16, T1
- lxvd2x vs62, o32, T1
- lxvd2x vs63, o48, T1
-
- //############### OFFSET 0 #######################
- // Right-hand side = B minus accumulated product (row r ends up in vs(32+2r), vs(33+2r)), interleaved with splat-loads of coefficient group 0 (16 values) into vs0..vs15. Group k below means the 16-k values read after skipping k entries.
- dcbt AO, PRE // cache touch hint for the address AO + PRE
- mr T1, AO // T1 now walks the coefficient data
-
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvsubdp vs36, vs36, vs4
- xvsubdp vs37, vs37, vs5
- xvsubdp vs38, vs38, vs6
- xvsubdp vs39, vs39, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvsubdp vs40, vs40, vs8
- xvsubdp vs41, vs41, vs9
- xvsubdp vs42, vs42, vs10
- xvsubdp vs43, vs43, vs11
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- xvsubdp vs44, vs44, vs12
- xvsubdp vs45, vs45, vs13
- xvsubdp vs46, vs46, vs14
- xvsubdp vs47, vs47, vs15
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
- lxvdsx vs14, o16, T1
- lxvdsx vs15, o24, T1
-
- addi T1, T1, 32
-
- xvsubdp vs48, vs48, vs16
- xvsubdp vs49, vs49, vs17
- xvsubdp vs50, vs50, vs18
- xvsubdp vs51, vs51, vs19
-
- xvsubdp vs52, vs52, vs20
- xvsubdp vs53, vs53, vs21
- xvsubdp vs54, vs54, vs22
- xvsubdp vs55, vs55, vs23
-
- xvsubdp vs56, vs56, vs24
- xvsubdp vs57, vs57, vs25
- xvsubdp vs58, vs58, vs26
- xvsubdp vs59, vs59, vs27
-
- xvsubdp vs60, vs60, vs28
- xvsubdp vs61, vs61, vs29
- xvsubdp vs62, vs62, vs30
- xvsubdp vs63, vs63, vs31
-
- //############### OFFSET 1 #######################
- // From here on the section labelled OFFSET k applies group k-1 (already in registers) and loads group k: row k-1 is scaled by vs0, then removed from every later row (xvnmsubadp: target = target - a * b). Each coefficient register is reloaded as soon as its last use has been issued.
- addi T1, T1, 1*SIZE // skip 1 entry, then group 1 supplies 15 coefficients
-
- xvmuldp vs32, vs32, vs0 // scale row 0; multiply, not divide - the leading coefficient of each group is presumably stored as a reciprocal (TODO confirm against the packing code)
- xvmuldp vs33, vs33, vs0
-
- xvnmsubadp vs34, vs32, vs1
- xvnmsubadp vs35, vs33, vs1
- xvnmsubadp vs36, vs32, vs2
- dcbt T1, PRE
- xvnmsubadp vs37, vs33, vs2
- xvnmsubadp vs38, vs32, vs3
- xvnmsubadp vs39, vs33, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs40, vs32, vs4
- xvnmsubadp vs41, vs33, vs4
- xvnmsubadp vs42, vs32, vs5
- xvnmsubadp vs43, vs33, vs5
- xvnmsubadp vs44, vs32, vs6
- xvnmsubadp vs45, vs33, vs6
- xvnmsubadp vs46, vs32, vs7
- xvnmsubadp vs47, vs33, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs48, vs32, vs8
- xvnmsubadp vs49, vs33, vs8
- xvnmsubadp vs50, vs32, vs9
- xvnmsubadp vs51, vs33, vs9
- xvnmsubadp vs52, vs32, vs10
- xvnmsubadp vs53, vs33, vs10
- xvnmsubadp vs54, vs32, vs11
- xvnmsubadp vs55, vs33, vs11
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs56, vs32, vs12
- xvnmsubadp vs57, vs33, vs12
- xvnmsubadp vs58, vs32, vs13
- xvnmsubadp vs59, vs33, vs13
- xvnmsubadp vs60, vs32, vs14
- xvnmsubadp vs61, vs33, vs14
- xvnmsubadp vs62, vs32, vs15
- xvnmsubadp vs63, vs33, vs15
-
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
- lxvdsx vs14, o16, T1
-
- addi T1, T1, 24
-
- //############### OFFSET 2 #######################
-
- xvmuldp vs34, vs34, vs0 // scale row 1
- xvmuldp vs35, vs35, vs0
-
- addi T1, T1, 2*SIZE // skip 2 entries, then group 2 supplies 14 coefficients
-
- xvnmsubadp vs36, vs34, vs1
- xvnmsubadp vs37, vs35, vs1
- xvnmsubadp vs38, vs34, vs2
- dcbt T1, PRE
- xvnmsubadp vs39, vs35, vs2
- xvnmsubadp vs40, vs34, vs3
- xvnmsubadp vs41, vs35, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs42, vs34, vs4
- xvnmsubadp vs43, vs35, vs4
- xvnmsubadp vs44, vs34, vs5
- xvnmsubadp vs45, vs35, vs5
- xvnmsubadp vs46, vs34, vs6
- xvnmsubadp vs47, vs35, vs6
- xvnmsubadp vs48, vs34, vs7
- xvnmsubadp vs49, vs35, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs50, vs34, vs8
- xvnmsubadp vs51, vs35, vs8
- xvnmsubadp vs52, vs34, vs9
- xvnmsubadp vs53, vs35, vs9
- xvnmsubadp vs54, vs34, vs10
- xvnmsubadp vs55, vs35, vs10
- xvnmsubadp vs56, vs34, vs11
- xvnmsubadp vs57, vs35, vs11
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
-
- xvnmsubadp vs58, vs34, vs12
- xvnmsubadp vs59, vs35, vs12
- xvnmsubadp vs60, vs34, vs13
- xvnmsubadp vs61, vs35, vs13
- xvnmsubadp vs62, vs34, vs14
- xvnmsubadp vs63, vs35, vs14
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
-
- addi T1, T1, 16
-
- //############### OFFSET 3 #######################
- xvmuldp vs36, vs36, vs0 // scale row 2
- xvmuldp vs37, vs37, vs0
-
- addi T1, T1, 3*SIZE // skip 3 entries, then group 3 supplies 13 coefficients
-
- xvnmsubadp vs38, vs36, vs1
- xvnmsubadp vs39, vs37, vs1
- xvnmsubadp vs40, vs36, vs2
- dcbt T1, PRE
- xvnmsubadp vs41, vs37, vs2
- xvnmsubadp vs42, vs36, vs3
- xvnmsubadp vs43, vs37, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs44, vs36, vs4
- xvnmsubadp vs45, vs37, vs4
- xvnmsubadp vs46, vs36, vs5
- xvnmsubadp vs47, vs37, vs5
- xvnmsubadp vs48, vs36, vs6
- xvnmsubadp vs49, vs37, vs6
- xvnmsubadp vs50, vs36, vs7
- xvnmsubadp vs51, vs37, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs52, vs36, vs8
- xvnmsubadp vs53, vs37, vs8
- xvnmsubadp vs54, vs36, vs9
- xvnmsubadp vs55, vs37, vs9
- xvnmsubadp vs56, vs36, vs10
- xvnmsubadp vs57, vs37, vs10
- xvnmsubadp vs58, vs36, vs11
- xvnmsubadp vs59, vs37, vs11
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs60, vs36, vs12
- xvnmsubadp vs61, vs37, vs12
- xvnmsubadp vs62, vs36, vs13
- xvnmsubadp vs63, vs37, vs13
-
- lxvdsx vs12, o0, T1
- // Rows 0 and 1 (vs32..vs35) are final at this point: store them back through T4.
- stxvd2x vs32, o0, T4
- stxvd2x vs33, o16, T4
- stxvd2x vs34, o32, T4
- stxvd2x vs35, o48, T4
-
- addi T4, T4, 64
-
- addi T1, T1, 8
-
- //############### OFFSET 4 #######################
- xvmuldp vs38, vs38, vs0 // scale row 3
- xvmuldp vs39, vs39, vs0
-
- addi T1, T1, 4*SIZE // skip 4 entries, then group 4 supplies 12 coefficients
-
- xvnmsubadp vs40, vs38, vs1
- xvnmsubadp vs41, vs39, vs1
- xvnmsubadp vs42, vs38, vs2
- dcbt T1, PRE
- xvnmsubadp vs43, vs39, vs2
- xvnmsubadp vs44, vs38, vs3
- xvnmsubadp vs45, vs39, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs46, vs38, vs4
- xvnmsubadp vs47, vs39, vs4
- xvnmsubadp vs48, vs38, vs5
- xvnmsubadp vs49, vs39, vs5
- xvnmsubadp vs50, vs38, vs6
- xvnmsubadp vs51, vs39, vs6
- xvnmsubadp vs52, vs38, vs7
- xvnmsubadp vs53, vs39, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
-
- xvnmsubadp vs54, vs38, vs8
- xvnmsubadp vs55, vs39, vs8
- xvnmsubadp vs56, vs38, vs9
- xvnmsubadp vs57, vs39, vs9
- xvnmsubadp vs58, vs38, vs10
- xvnmsubadp vs59, vs39, vs10
- xvnmsubadp vs60, vs38, vs11
- xvnmsubadp vs61, vs39, vs11
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs62, vs38, vs12
- xvnmsubadp vs63, vs39, vs12
-
-
- //############### OFFSET 5 #######################
- xvmuldp vs40, vs40, vs0 // scale row 4
- xvmuldp vs41, vs41, vs0
-
- addi T1, T1, 5*SIZE // skip 5 entries, then group 5 supplies 11 coefficients
-
- xvnmsubadp vs42, vs40, vs1
- xvnmsubadp vs43, vs41, vs1
- xvnmsubadp vs44, vs40, vs2
- dcbt T1, PRE
- xvnmsubadp vs45, vs41, vs2
- xvnmsubadp vs46, vs40, vs3
- xvnmsubadp vs47, vs41, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs48, vs40, vs4
- xvnmsubadp vs49, vs41, vs4
- xvnmsubadp vs50, vs40, vs5
- xvnmsubadp vs51, vs41, vs5
- xvnmsubadp vs52, vs40, vs6
- xvnmsubadp vs53, vs41, vs6
- xvnmsubadp vs54, vs40, vs7
- xvnmsubadp vs55, vs41, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs56, vs40, vs8
- xvnmsubadp vs57, vs41, vs8
- xvnmsubadp vs58, vs40, vs9
- xvnmsubadp vs59, vs41, vs9
- xvnmsubadp vs60, vs40, vs10
- xvnmsubadp vs61, vs41, vs10
- xvnmsubadp vs62, vs40, vs11
- xvnmsubadp vs63, vs41, vs11
-
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
-
- addi T1, T1, 24
-
- //############### OFFSET 6 #######################
- xvmuldp vs42, vs42, vs0 // scale row 5
- xvmuldp vs43, vs43, vs0
-
- addi T1, T1, 6*SIZE // skip 6 entries, then group 6 supplies 10 coefficients
-
- xvnmsubadp vs44, vs42, vs1
- xvnmsubadp vs45, vs43, vs1
- xvnmsubadp vs46, vs42, vs2
- dcbt T1, PRE
- xvnmsubadp vs47, vs43, vs2
- xvnmsubadp vs48, vs42, vs3
- xvnmsubadp vs49, vs43, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs50, vs42, vs4
- xvnmsubadp vs51, vs43, vs4
- xvnmsubadp vs52, vs42, vs5
- xvnmsubadp vs53, vs43, vs5
- xvnmsubadp vs54, vs42, vs6
- xvnmsubadp vs55, vs43, vs6
- xvnmsubadp vs56, vs42, vs7
- xvnmsubadp vs57, vs43, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs58, vs42, vs8
- xvnmsubadp vs59, vs43, vs8
- xvnmsubadp vs60, vs42, vs9
- xvnmsubadp vs61, vs43, vs9
- xvnmsubadp vs62, vs42, vs10
- xvnmsubadp vs63, vs43, vs10
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
-
- addi T1, T1, 16
- // Rows 2 and 3 (vs36..vs39) are final: store them back through T4.
- stxvd2x vs36, o0, T4
- stxvd2x vs37, o16, T4
- stxvd2x vs38, o32, T4
- stxvd2x vs39, o48, T4
-
- addi T4, T4, 64
-
- //############### OFFSET 7 #######################
- xvmuldp vs44, vs44, vs0 // scale row 6
- xvmuldp vs45, vs45, vs0
-
- addi T1, T1, 7*SIZE // skip 7 entries, then group 7 supplies 9 coefficients
-
- xvnmsubadp vs46, vs44, vs1
- xvnmsubadp vs47, vs45, vs1
- xvnmsubadp vs48, vs44, vs2
- dcbt T1, PRE
- xvnmsubadp vs49, vs45, vs2
- xvnmsubadp vs50, vs44, vs3
- xvnmsubadp vs51, vs45, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs52, vs44, vs4
- xvnmsubadp vs53, vs45, vs4
- xvnmsubadp vs54, vs44, vs5
- xvnmsubadp vs55, vs45, vs5
- xvnmsubadp vs56, vs44, vs6
- xvnmsubadp vs57, vs45, vs6
- xvnmsubadp vs58, vs44, vs7
- xvnmsubadp vs59, vs45, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs60, vs44, vs8
- xvnmsubadp vs61, vs45, vs8
- xvnmsubadp vs62, vs44, vs9
- xvnmsubadp vs63, vs45, vs9
-
- lxvdsx vs8, o0, T1
-
- addi T1, T1, 8
-
- //############### OFFSET 8 #######################
- xvmuldp vs46, vs46, vs0 // scale row 7
- xvmuldp vs47, vs47, vs0
-
- addi T1, T1, 8*SIZE // skip 8 entries, then group 8 supplies 8 coefficients
-
- xvnmsubadp vs48, vs46, vs1
- xvnmsubadp vs49, vs47, vs1
- xvnmsubadp vs50, vs46, vs2
- dcbt T1, PRE
- xvnmsubadp vs51, vs47, vs2
- xvnmsubadp vs52, vs46, vs3
- xvnmsubadp vs53, vs47, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs54, vs46, vs4
- xvnmsubadp vs55, vs47, vs4
- xvnmsubadp vs56, vs46, vs5
- xvnmsubadp vs57, vs47, vs5
- xvnmsubadp vs58, vs46, vs6
- xvnmsubadp vs59, vs47, vs6
- xvnmsubadp vs60, vs46, vs7
- xvnmsubadp vs61, vs47, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
- // Rows 4 and 5 (vs40..vs43) are final: store them back through T4.
- stxvd2x vs40, o0, T4
- stxvd2x vs41, o16, T4
- stxvd2x vs42, o32, T4
- stxvd2x vs43, o48, T4
-
- addi T4, T4, 64
-
- xvnmsubadp vs62, vs46, vs8
- xvnmsubadp vs63, vs47, vs8
-
-
- //############### OFFSET 9 #######################
- xvmuldp vs48, vs48, vs0 // scale row 8
- xvmuldp vs49, vs49, vs0
-
- addi T1, T1, 9*SIZE // skip 9 entries, then group 9 supplies 7 coefficients
-
- xvnmsubadp vs50, vs48, vs1
- xvnmsubadp vs51, vs49, vs1
- xvnmsubadp vs52, vs48, vs2
- dcbt T1, PRE
- xvnmsubadp vs53, vs49, vs2
- xvnmsubadp vs54, vs48, vs3
- xvnmsubadp vs55, vs49, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs56, vs48, vs4
- xvnmsubadp vs57, vs49, vs4
- xvnmsubadp vs58, vs48, vs5
- xvnmsubadp vs59, vs49, vs5
- xvnmsubadp vs60, vs48, vs6
- xvnmsubadp vs61, vs49, vs6
- xvnmsubadp vs62, vs48, vs7
- xvnmsubadp vs63, vs49, vs7
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
-
- addi T1, T1, 24
-
- //############### OFFSET 10 #######################
- xvmuldp vs50, vs50, vs0 // scale row 9
- xvmuldp vs51, vs51, vs0
-
- addi T1, T1, 10*SIZE // skip 10 entries, then group 10 supplies 6 coefficients
-
- xvnmsubadp vs52, vs50, vs1
- xvnmsubadp vs53, vs51, vs1
- xvnmsubadp vs54, vs50, vs2
- dcbt T1, PRE
- xvnmsubadp vs55, vs51, vs2
- xvnmsubadp vs56, vs50, vs3
- xvnmsubadp vs57, vs51, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs58, vs50, vs4
- xvnmsubadp vs59, vs51, vs4
- xvnmsubadp vs60, vs50, vs5
- xvnmsubadp vs61, vs51, vs5
- xvnmsubadp vs62, vs50, vs6
- xvnmsubadp vs63, vs51, vs6
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
-
- addi T1, T1, 16
- // Rows 6 and 7 (vs44..vs47) are final: store them back through T4.
- stxvd2x vs44, o0, T4
- stxvd2x vs45, o16, T4
- stxvd2x vs46, o32, T4
- stxvd2x vs47, o48, T4
-
- addi T4, T4, 64
-
- //############### OFFSET 11 #######################
- xvmuldp vs52, vs52, vs0 // scale row 10
- xvmuldp vs53, vs53, vs0
-
- addi T1, T1, 11*SIZE // skip 11 entries, then group 11 supplies 5 coefficients
-
- xvnmsubadp vs54, vs52, vs1
- xvnmsubadp vs55, vs53, vs1
- xvnmsubadp vs56, vs52, vs2
- dcbt T1, PRE
- xvnmsubadp vs57, vs53, vs2
- xvnmsubadp vs58, vs52, vs3
- xvnmsubadp vs59, vs53, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvnmsubadp vs60, vs52, vs4
- xvnmsubadp vs61, vs53, vs4
- xvnmsubadp vs62, vs52, vs5
- xvnmsubadp vs63, vs53, vs5
-
- lxvdsx vs4, o0, T1
-
- addi T1, T1, 8
-
- //############### OFFSET 12 #######################
- xvmuldp vs54, vs54, vs0 // scale row 11
- xvmuldp vs55, vs55, vs0
-
- addi T1, T1, 12*SIZE // skip 12 entries, then group 12 supplies 4 coefficients
-
- xvnmsubadp vs56, vs54, vs1
- xvnmsubadp vs57, vs55, vs1
- xvnmsubadp vs58, vs54, vs2
- dcbt T1, PRE
- xvnmsubadp vs59, vs55, vs2
- xvnmsubadp vs60, vs54, vs3
- xvnmsubadp vs61, vs55, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
- // Rows 8 and 9 (vs48..vs51) are final: store them back through T4.
- stxvd2x vs48, o0, T4
- stxvd2x vs49, o16, T4
- stxvd2x vs50, o32, T4
- stxvd2x vs51, o48, T4
-
- addi T4, T4, 64
-
- xvnmsubadp vs62, vs54, vs4
- xvnmsubadp vs63, vs55, vs4
-
-
- //############### OFFSET 13 #######################
- xvmuldp vs56, vs56, vs0 // scale row 12
- xvmuldp vs57, vs57, vs0
-
- addi T1, T1, 13*SIZE // skip 13 entries, then group 13 supplies 3 coefficients
-
- xvnmsubadp vs58, vs56, vs1
- xvnmsubadp vs59, vs57, vs1
- xvnmsubadp vs60, vs56, vs2
- xvnmsubadp vs61, vs57, vs2
- xvnmsubadp vs62, vs56, vs3
- xvnmsubadp vs63, vs57, vs3
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- //############### OFFSET 14 #######################
- xvmuldp vs58, vs58, vs0 // scale row 13
- xvmuldp vs59, vs59, vs0
-
- addi T1, T1, 14*SIZE // skip 14 entries, then group 14 supplies 2 coefficients
-
- xvnmsubadp vs60, vs58, vs1
- xvnmsubadp vs61, vs59, vs1
- xvnmsubadp vs62, vs58, vs2
- xvnmsubadp vs63, vs59, vs2
-
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
- // Rows 10 and 11 (vs52..vs55) are final: store them back through T4.
- stxvd2x vs52, o0, T4
- stxvd2x vs53, o16, T4
- stxvd2x vs54, o32, T4
- stxvd2x vs55, o48, T4
-
- addi T4, T4, 64
- //############### OFFSET 15 #######################
- xvmuldp vs60, vs60, vs0 // scale row 14
- xvmuldp vs61, vs61, vs0
-
- addi T1, T1, 15*SIZE // skip 15 entries, then group 15 supplies the single last coefficient
-
- xvnmsubadp vs62, vs60, vs1
- xvnmsubadp vs63, vs61, vs1
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8 // NOTE(review): T1 is reloaded from CO below before any further use, so this increment has no visible effect
-
- xvmuldp vs62, vs62, vs0 // scale row 15
- xvmuldp vs63, vs63, vs0
-
-
- //############### SAVE B #######################
-
-
- // Rows 0..11 were already stored during the substitution; only rows 12..15 (vs56..vs63) remain.
- stxvd2x vs56, o0, T4
- stxvd2x vs57, o16, T4
- stxvd2x vs58, o32, T4
- stxvd2x vs59, o48, T4
-
- addi T4, T4, 64
-
- stxvd2x vs60, o0, T4
- stxvd2x vs61, o16, T4
- stxvd2x vs62, o32, T4
- stxvd2x vs63, o48, T4
-
- //############### SAVE C #######################
-
- // Column j of the solved tile is written as 16 consecutive doubles at CO + j*LDC. stxsdx stores doubleword 0 of its source; the even registers carry columns 0 and 1, the odd registers columns 2 and 3.
- // XXSWAPD is defined outside this section; presumably it swaps the two doublewords so the second column of each pair can be stored through T2 - TODO confirm.
- mr T1, CO // T1 = column 0
- add T2, CO, LDC // T2 = column 1
-
-
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs34, o8, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs36, o16, T1
- XXSWAPD(vs36,vs36)
- stxsdx vs38, o24, T1
- XXSWAPD(vs38,vs38)
-
- addi T1, T1, 32
-
- stxsdx vs40, o0, T1
- XXSWAPD(vs40,vs40)
- stxsdx vs42, o8, T1
- XXSWAPD(vs42,vs42)
- stxsdx vs44, o16, T1
- XXSWAPD(vs44,vs44)
- stxsdx vs46, o24, T1
- XXSWAPD(vs46,vs46)
-
- addi T1, T1, 32
-
- stxsdx vs48, o0, T1
- XXSWAPD(vs48,vs48)
- stxsdx vs50, o8, T1
- XXSWAPD(vs50,vs50)
- stxsdx vs52, o16, T1
- XXSWAPD(vs52,vs52)
- stxsdx vs54, o24, T1
- XXSWAPD(vs54,vs54)
-
- addi T1, T1, 32
-
- stxsdx vs56, o0, T1
- XXSWAPD(vs56,vs56)
- stxsdx vs58, o8, T1
- XXSWAPD(vs58,vs58)
- stxsdx vs60, o16, T1
- XXSWAPD(vs60,vs60)
- stxsdx vs62, o24, T1
- XXSWAPD(vs62,vs62)
-
- stxsdx vs32, o0, T2
- stxsdx vs34, o8, T2
- stxsdx vs36, o16, T2
- stxsdx vs38, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs40, o0, T2
- stxsdx vs42, o8, T2
- stxsdx vs44, o16, T2
- stxsdx vs46, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs48, o0, T2
- stxsdx vs50, o8, T2
- stxsdx vs52, o16, T2
- stxsdx vs54, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs56, o0, T2
- stxsdx vs58, o8, T2
- stxsdx vs60, o16, T2
- stxsdx vs62, o24, T2
-
- mr T1, CO // NOTE(review): redundant - T1 is overwritten by add T1, T2, LDC below before it is read
- add T2, CO, LDC // T2 = CO + LDC, used only as the base for the next two adds
-
-
- add T1, T2, LDC // T1 = CO + 2*LDC (column 2)
- add T2, T1, LDC // T2 = CO + 3*LDC (column 3)
-
-
- stxsdx vs33, o0, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs35, o8, T1
- XXSWAPD(vs35,vs35)
- stxsdx vs37, o16, T1
- XXSWAPD(vs37,vs37)
- stxsdx vs39, o24, T1
- XXSWAPD(vs39,vs39)
-
- addi T1, T1, 32
-
- stxsdx vs41, o0, T1
- XXSWAPD(vs41,vs41)
- stxsdx vs43, o8, T1
- XXSWAPD(vs43,vs43)
- stxsdx vs45, o16, T1
- XXSWAPD(vs45,vs45)
- stxsdx vs47, o24, T1
- XXSWAPD(vs47,vs47)
-
- addi T1, T1, 32
-
- stxsdx vs49, o0, T1
- XXSWAPD(vs49,vs49)
- stxsdx vs51, o8, T1
- XXSWAPD(vs51,vs51)
- stxsdx vs53, o16, T1
- XXSWAPD(vs53,vs53)
- stxsdx vs55, o24, T1
- XXSWAPD(vs55,vs55)
-
- addi T1, T1, 32
-
- stxsdx vs57, o0, T1
- XXSWAPD(vs57,vs57)
- stxsdx vs59, o8, T1
- XXSWAPD(vs59,vs59)
- stxsdx vs61, o16, T1
- XXSWAPD(vs61,vs61)
- stxsdx vs63, o24, T1
- XXSWAPD(vs63,vs63)
-
- stxsdx vs33, o0, T2
- stxsdx vs35, o8, T2
- stxsdx vs37, o16, T2
- stxsdx vs39, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs41, o0, T2
- stxsdx vs43, o8, T2
- stxsdx vs45, o16, T2
- stxsdx vs47, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs49, o0, T2
- stxsdx vs51, o8, T2
- stxsdx vs53, o16, T2
- stxsdx vs55, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs57, o0, T2
- stxsdx vs59, o8, T2
- stxsdx vs61, o16, T2
- stxsdx vs63, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 8x4
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_8x4', `
- #else
- .macro SOLVE_LT_8x4
- #endif
- // SOLVE_LT_8x4: finish an 8x4 tile. (1) Transpose the accumulators vs32..vs47 into vs0..vs15. (2) Load 16 vectors through BO and subtract the transposed accumulators. (3) Forward-substitute with coefficients splat-loaded through AO. (4) Store the result back through BO and to four LDC-strided columns at CO.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs34, vs35, 0
- xxpermdi vs2, vs32, vs33, 3
- xxpermdi vs3, vs34, vs35, 3
-
- xxpermdi vs4, vs36, vs37, 0
- xxpermdi vs5, vs38, vs39, 0
- xxpermdi vs6, vs36, vs37, 3
- xxpermdi vs7, vs38, vs39, 3
-
- xxpermdi vs8, vs40, vs41, 0
- xxpermdi vs9, vs42, vs43, 0
- xxpermdi vs10, vs40, vs41, 3
- xxpermdi vs11, vs42, vs43, 3
-
- xxpermdi vs12, vs44, vs45, 0
- xxpermdi vs13, vs46, vs47, 0
- xxpermdi vs14, vs44, vs45, 3
- xxpermdi vs15, vs46, vs47, 3
- // xxpermdi selector 0 pairs doubleword 0 of both sources, selector 3 pairs doubleword 1: vs(2r) and vs(2r+1) now hold row r of the accumulated product (columns 0..1 and 2..3).
- // Writes T1, T2, vs0..vs15 and vs32..vs47; AO, BO, CO and LDC are only read. o0..o48 and SIZE come from outside this section (byte offsets and element size by the look of it - TODO confirm).
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs36, o0, T1
- lxvd2x vs37, o16, T1
- lxvd2x vs38, o32, T1
- lxvd2x vs39, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs40, o0, T1
- lxvd2x vs41, o16, T1
- lxvd2x vs42, o32, T1
- lxvd2x vs43, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs44, o0, T1
- lxvd2x vs45, o16, T1
- lxvd2x vs46, o32, T1
- lxvd2x vs47, o48, T1
- // Right-hand side = B minus accumulated product; row r ends up in vs(32+2r), vs(33+2r).
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
- xvsubdp vs36, vs36, vs4
- xvsubdp vs37, vs37, vs5
- xvsubdp vs38, vs38, vs6
- xvsubdp vs39, vs39, vs7
- xvsubdp vs40, vs40, vs8
- xvsubdp vs41, vs41, vs9
- xvsubdp vs42, vs42, vs10
- xvsubdp vs43, vs43, vs11
- xvsubdp vs44, vs44, vs12
- xvsubdp vs45, vs45, vs13
- xvsubdp vs46, vs46, vs14
- xvsubdp vs47, vs47, vs15
-
- mr T1, AO // T1 now walks the coefficient data
-
- // Each OFFSET k section skips k entries, splat-loads the 8-k coefficients of group k, scales row k by the first one and removes row k from every later row (xvnmsubadp: target = target - a * b).
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs32, vs32, vs0 // scale row 0; multiply, not divide - the leading coefficient of each group is presumably stored as a reciprocal (TODO confirm against the packing code)
- xvmuldp vs33, vs33, vs0
-
- xvnmsubadp vs34, vs32, vs1
- xvnmsubadp vs35, vs33, vs1
- xvnmsubadp vs36, vs32, vs2
- xvnmsubadp vs37, vs33, vs2
- xvnmsubadp vs38, vs32, vs3
- xvnmsubadp vs39, vs33, vs3
- xvnmsubadp vs40, vs32, vs4
- xvnmsubadp vs41, vs33, vs4
- xvnmsubadp vs42, vs32, vs5
- xvnmsubadp vs43, vs33, vs5
- xvnmsubadp vs44, vs32, vs6
- xvnmsubadp vs45, vs33, vs6
- xvnmsubadp vs46, vs32, vs7
- xvnmsubadp vs47, vs33, vs7
-
- //############### OFFSET 1 #######################
-
- addi T1, T1, 1*SIZE // skip 1 entry, then group 1 supplies 7 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs34, vs34, vs0 // scale row 1
- xvmuldp vs35, vs35, vs0
-
- xvnmsubadp vs36, vs34, vs1
- xvnmsubadp vs37, vs35, vs1
- xvnmsubadp vs38, vs34, vs2
- xvnmsubadp vs39, vs35, vs2
- xvnmsubadp vs40, vs34, vs3
- xvnmsubadp vs41, vs35, vs3
- xvnmsubadp vs42, vs34, vs4
- xvnmsubadp vs43, vs35, vs4
- xvnmsubadp vs44, vs34, vs5
- xvnmsubadp vs45, vs35, vs5
- xvnmsubadp vs46, vs34, vs6
- xvnmsubadp vs47, vs35, vs6
-
- //############### OFFSET 2 #######################
-
- addi T1, T1, 2*SIZE // skip 2 entries, then group 2 supplies 6 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs36, vs36, vs0 // scale row 2
- xvmuldp vs37, vs37, vs0
-
- xvnmsubadp vs38, vs36, vs1
- xvnmsubadp vs39, vs37, vs1
- xvnmsubadp vs40, vs36, vs2
- xvnmsubadp vs41, vs37, vs2
- xvnmsubadp vs42, vs36, vs3
- xvnmsubadp vs43, vs37, vs3
- xvnmsubadp vs44, vs36, vs4
- xvnmsubadp vs45, vs37, vs4
- xvnmsubadp vs46, vs36, vs5
- xvnmsubadp vs47, vs37, vs5
-
- //############### OFFSET 3 #######################
-
- addi T1, T1, 3*SIZE // skip 3 entries, then group 3 supplies 5 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs38, vs38, vs0 // scale row 3
- xvmuldp vs39, vs39, vs0
-
- xvnmsubadp vs40, vs38, vs1
- xvnmsubadp vs41, vs39, vs1
- xvnmsubadp vs42, vs38, vs2
- xvnmsubadp vs43, vs39, vs2
- xvnmsubadp vs44, vs38, vs3
- xvnmsubadp vs45, vs39, vs3
- xvnmsubadp vs46, vs38, vs4
- xvnmsubadp vs47, vs39, vs4
-
- //############### OFFSET 4 #######################
-
- addi T1, T1, 4*SIZE // skip 4 entries, then group 4 supplies 4 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs40, vs40, vs0 // scale row 4
- xvmuldp vs41, vs41, vs0
-
- xvnmsubadp vs42, vs40, vs1
- xvnmsubadp vs43, vs41, vs1
- xvnmsubadp vs44, vs40, vs2
- xvnmsubadp vs45, vs41, vs2
- xvnmsubadp vs46, vs40, vs3
- xvnmsubadp vs47, vs41, vs3
-
- //############### OFFSET 5 #######################
-
- addi T1, T1, 5*SIZE // skip 5 entries, then group 5 supplies 3 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs42, vs42, vs0 // scale row 5
- xvmuldp vs43, vs43, vs0
-
- xvnmsubadp vs44, vs42, vs1
- xvnmsubadp vs45, vs43, vs1
- xvnmsubadp vs46, vs42, vs2
- xvnmsubadp vs47, vs43, vs2
-
- //############### OFFSET 6 #######################
-
- addi T1, T1, 6*SIZE // skip 6 entries, then group 6 supplies 2 coefficients
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs44, vs44, vs0 // scale row 6
- xvmuldp vs45, vs45, vs0
-
- xvnmsubadp vs46, vs44, vs1
- xvnmsubadp vs47, vs45, vs1
-
- //############### OFFSET 7 #######################
-
- addi T1, T1, 7*SIZE // skip 7 entries, then group 7 supplies the single last coefficient
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8 // NOTE(review): T1 is reloaded from BO below before any further use, so this increment has no visible effect
-
- xvmuldp vs46, vs46, vs0 // scale row 7
- xvmuldp vs47, vs47, vs0
-
-
- //############### SAVE B #######################
-
- // The solved tile overwrites the B data it was loaded from (same base BO, same 16-vector layout).
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs36, o0, T1
- stxvd2x vs37, o16, T1
- stxvd2x vs38, o32, T1
- stxvd2x vs39, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs40, o0, T1
- stxvd2x vs41, o16, T1
- stxvd2x vs42, o32, T1
- stxvd2x vs43, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs44, o0, T1
- stxvd2x vs45, o16, T1
- stxvd2x vs46, o32, T1
- stxvd2x vs47, o48, T1
-
- //############### SAVE C #######################
-
- // Column j of the solved tile is written as 8 consecutive doubles at CO + j*LDC. stxsdx stores doubleword 0 of its source; XXSWAPD (defined outside this section, presumably a doubleword swap - TODO confirm) exposes the second column of each pair for the T2 stores.
- mr T1, CO // T1 = column 0
- add T2, CO, LDC // T2 = column 1
-
-
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs34, o8, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs36, o16, T1
- XXSWAPD(vs36,vs36)
- stxsdx vs38, o24, T1
- XXSWAPD(vs38,vs38)
-
- addi T1, T1, 32
-
- stxsdx vs40, o0, T1
- XXSWAPD(vs40,vs40)
- stxsdx vs42, o8, T1
- XXSWAPD(vs42,vs42)
- stxsdx vs44, o16, T1
- XXSWAPD(vs44,vs44)
- stxsdx vs46, o24, T1
- XXSWAPD(vs46,vs46)
-
- stxsdx vs32, o0, T2
- stxsdx vs34, o8, T2
- stxsdx vs36, o16, T2
- stxsdx vs38, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs40, o0, T2
- stxsdx vs42, o8, T2
- stxsdx vs44, o16, T2
- stxsdx vs46, o24, T2
-
- mr T1, CO // NOTE(review): redundant - T1 is overwritten by add T1, T2, LDC below before it is read
- add T2, CO, LDC // T2 = CO + LDC, used only as the base for the next two adds
-
-
- add T1, T2, LDC // T1 = CO + 2*LDC (column 2)
- add T2, T1, LDC // T2 = CO + 3*LDC (column 3)
-
-
- stxsdx vs33, o0, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs35, o8, T1
- XXSWAPD(vs35,vs35)
- stxsdx vs37, o16, T1
- XXSWAPD(vs37,vs37)
- stxsdx vs39, o24, T1
- XXSWAPD(vs39,vs39)
-
- addi T1, T1, 32
-
- stxsdx vs41, o0, T1
- XXSWAPD(vs41,vs41)
- stxsdx vs43, o8, T1
- XXSWAPD(vs43,vs43)
- stxsdx vs45, o16, T1
- XXSWAPD(vs45,vs45)
- stxsdx vs47, o24, T1
- XXSWAPD(vs47,vs47)
-
- stxsdx vs33, o0, T2
- stxsdx vs35, o8, T2
- stxsdx vs37, o16, T2
- stxsdx vs39, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs41, o0, T2
- stxsdx vs43, o8, T2
- stxsdx vs45, o16, T2
- stxsdx vs47, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 4x4
- ##########################################################################################*/
-
- // SOLVE_LT_4x4
- // Solve step for the tile accumulated in vs32-vs39 (eight vectors of two
- // doubles). Reads the packed right-hand side at BO and the coefficients
- // at AO, writes the result back to BO and to C at CO, CO+LDC, CO+2*LDC
- // and CO+3*LDC. Clobbers vs0-vs7, vs32-vs39, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_4x4', `
- #else
- .macro SOLVE_LT_4x4
- #endif
-
- // Regroup the accumulators so they line up with the vectors loaded from
- // BO below: selector 0 pairs doubleword 0 of both sources, selector 3
- // pairs doubleword 1 of both sources.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs34, vs35, 0
- xxpermdi vs2, vs32, vs33, 3
- xxpermdi vs3, vs34, vs35, 3
-
- xxpermdi vs4, vs36, vs37, 0
- xxpermdi vs5, vs38, vs39, 0
- xxpermdi vs6, vs36, vs37, 3
- xxpermdi vs7, vs38, vs39, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs36, o0, T1
- lxvd2x vs37, o16, T1
- lxvd2x vs38, o32, T1
- lxvd2x vs39, o48, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
- xvsubdp vs36, vs36, vs4
- xvsubdp vs37, vs37, vs5
- xvsubdp vs38, vs38, vs6
- xvsubdp vs39, vs39, vs7
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // Each lxvdsx loads one coefficient and replicates it in both lanes.
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- // The first coefficient is applied by multiplication, so it is
- // presumably stored pre-inverted by the packing code (TODO confirm).
- xvmuldp vs32, vs32, vs0
- xvmuldp vs33, vs33, vs0
-
- // xvnmsubadp computes target = target - a * b: remove the contribution
- // of the row just solved from every remaining row.
- xvnmsubadp vs34, vs32, vs1
- xvnmsubadp vs35, vs33, vs1
- xvnmsubadp vs36, vs32, vs2
- xvnmsubadp vs37, vs33, vs2
- xvnmsubadp vs38, vs32, vs3
- xvnmsubadp vs39, vs33, vs3
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this row does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs34, vs34, vs0
- xvmuldp vs35, vs35, vs0
-
- xvnmsubadp vs36, vs34, vs1
- xvnmsubadp vs37, vs35, vs1
- xvnmsubadp vs38, vs34, vs2
- xvnmsubadp vs39, vs35, vs2
-
- //############### OFFSET 2 #######################
-
- // Skip two unused coefficients.
- addi T1, T1, 2*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs36, vs36, vs0
- xvmuldp vs37, vs37, vs0
-
- xvnmsubadp vs38, vs36, vs1
- xvnmsubadp vs39, vs37, vs1
-
- //############### OFFSET 3 #######################
-
- // Skip three unused coefficients; only the last one remains.
- addi T1, T1, 3*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs38, vs38, vs0
- xvmuldp vs39, vs39, vs0
-
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs36, o0, T1
- stxvd2x vs37, o16, T1
- stxvd2x vs38, o32, T1
- stxvd2x vs39, o48, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the two doublewords, so the later stores through
- // T2 write the other half of each vector (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs34, o8, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs36, o16, T1
- XXSWAPD(vs36,vs36)
- stxsdx vs38, o24, T1
- XXSWAPD(vs38,vs38)
-
- stxsdx vs32, o0, T2
- stxsdx vs34, o8, T2
- stxsdx vs36, o16, T2
- stxsdx vs38, o24, T2
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // T1 = CO + 2*LDC, T2 = CO + 3*LDC.
- add T1, T2, LDC
- add T2, T1, LDC
-
-
- stxsdx vs33, o0, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs35, o8, T1
- XXSWAPD(vs35,vs35)
- stxsdx vs37, o16, T1
- XXSWAPD(vs37,vs37)
- stxsdx vs39, o24, T1
- XXSWAPD(vs39,vs39)
-
- stxsdx vs33, o0, T2
- stxsdx vs35, o8, T2
- stxsdx vs37, o16, T2
- stxsdx vs39, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 2x4
- ##########################################################################################*/
-
- // SOLVE_LT_2x4
- // Solve step for the tile accumulated in vs32-vs35. Reads the packed
- // right-hand side at BO and the coefficients at AO, writes the result
- // back to BO and to C at CO, CO+LDC, CO+2*LDC and CO+3*LDC.
- // Clobbers vs0-vs3, vs32-vs35, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_2x4', `
- #else
- .macro SOLVE_LT_2x4
- #endif
-
- // Regroup the accumulators: selector 0 pairs doubleword 0 of both
- // sources, selector 3 pairs doubleword 1 of both sources.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs34, vs35, 0
- xxpermdi vs2, vs32, vs33, 3
- xxpermdi vs3, vs34, vs35, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- // Applied by multiplication, so the first coefficient is presumably
- // stored pre-inverted by the packing code (TODO confirm).
- xvmuldp vs32, vs32, vs0
- xvmuldp vs33, vs33, vs0
-
- // target = target - a * b
- xvnmsubadp vs34, vs32, vs1
- xvnmsubadp vs35, vs33, vs1
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this row does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs34, vs34, vs0
- xvmuldp vs35, vs35, vs0
-
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the doublewords so the T2 stores write the other
- // half (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs34, o8, T1
- XXSWAPD(vs34,vs34)
-
- stxsdx vs32, o0, T2
- stxsdx vs34, o8, T2
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // T1 = CO + 2*LDC, T2 = CO + 3*LDC.
- add T1, T2, LDC
- add T2, T1, LDC
-
-
- stxsdx vs33, o0, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs35, o8, T1
- XXSWAPD(vs35,vs35)
-
- stxsdx vs33, o0, T2
- stxsdx vs35, o8, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 1x4
- ##########################################################################################*/
-
- // SOLVE_LT_1x4
- // Solve step for a single row spread over accumulators vs32-vs35. Reads
- // two vectors at BO and one coefficient at AO, writes the result back to
- // BO and to C at CO, CO+LDC, CO+2*LDC and CO+3*LDC.
- // Clobbers vs0, vs1, vs32, vs33, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_1x4', `
- #else
- .macro SOLVE_LT_1x4
- #endif
-
- // Gather doubleword 0 of each accumulator pair into one vector.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs34, vs35, 0
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- // Applied by multiplication, so the coefficient is presumably stored
- // pre-inverted by the packing code (TODO confirm).
- xvmuldp vs32, vs32, vs0
- xvmuldp vs33, vs33, vs0
-
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the doublewords so the T2 store writes the other
- // half (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
-
- stxsdx vs32, o0, T2
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // T1 = CO + 2*LDC, T2 = CO + 3*LDC.
- add T1, T2, LDC
- add T2, T1, LDC
-
-
- stxsdx vs33, o0, T1
- XXSWAPD(vs33,vs33)
-
- stxsdx vs33, o0, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // INIT_16x2
- // Clears the sixteen accumulators vs32-vs47. vs0 is zeroed with xxlxor
- // and then copied into each accumulator through the XVMOVDP macro, which
- // is defined elsewhere (presumably a plain vector register move).
- // Leaves vs0 equal to zero.
- #if defined(_AIX)
- define(`INIT_16x2', `
- #else
- .macro INIT_16x2
- #endif
-
-
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
- XVMOVDP(vs40,vs0)
- XVMOVDP(vs41,vs0)
- XVMOVDP(vs42,vs0)
- XVMOVDP(vs43,vs0)
- XVMOVDP(vs44,vs0)
- XVMOVDP(vs45,vs0)
- XVMOVDP(vs46,vs0)
- XVMOVDP(vs47,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // KERNEL_16x2
- // One multiply-accumulate step. Loads sixteen doubles from AO (eight
- // two-double vectors, vs0-vs7) and two doubles from BO, each replicated
- // across both lanes (vs16, vs17). Even accumulators vs32, vs34, ... take
- // the product with vs16, odd accumulators take the product with vs17.
- // Advances AO by 128 bytes and BO by 16 bytes.
- // Clobbers vs0-vs7, vs16, vs17 and updates vs32-vs47.
- #if defined(_AIX)
- define(`KERNEL_16x2', `
- #else
- .macro KERNEL_16x2
- #endif
-
-
- lxvd2x vs0, o0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- addi AO, AO, 64
-
- lxvd2x vs4, o0, AO
- lxvd2x vs5, o16, AO
- lxvd2x vs6, o32, AO
- lxvd2x vs7, o48, AO
-
- addi AO, AO, 64
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
-
- addi BO, BO, 16
-
- // accumulator = accumulator + a * b
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs1, vs16
- xvmaddadp vs35, vs1, vs17
- xvmaddadp vs36, vs2, vs16
- xvmaddadp vs37, vs2, vs17
- xvmaddadp vs38, vs3, vs16
- xvmaddadp vs39, vs3, vs17
- xvmaddadp vs40, vs4, vs16
- xvmaddadp vs41, vs4, vs17
- xvmaddadp vs42, vs5, vs16
- xvmaddadp vs43, vs5, vs17
- xvmaddadp vs44, vs6, vs16
- xvmaddadp vs45, vs6, vs17
- xvmaddadp vs46, vs7, vs16
- xvmaddadp vs47, vs7, vs17
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // INIT_8x2
- // Clears the eight accumulators vs32-vs39. vs0 is zeroed with xxlxor and
- // then copied into each accumulator through the XVMOVDP macro, which is
- // defined elsewhere (presumably a plain vector register move).
- // Leaves vs0 equal to zero.
- #if defined(_AIX)
- define(`INIT_8x2', `
- #else
- .macro INIT_8x2
- #endif
-
-
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // KERNEL_8x2
- // One multiply-accumulate step. Loads eight doubles from AO (vs0-vs3)
- // and two doubles from BO, each replicated across both lanes (vs16,
- // vs17). Even accumulators take the product with vs16, odd accumulators
- // take the product with vs17.
- // Advances AO by 64 bytes and BO by 16 bytes.
- // Clobbers vs0-vs3, vs16, vs17 and updates vs32-vs39.
- #if defined(_AIX)
- define(`KERNEL_8x2', `
- #else
- .macro KERNEL_8x2
- #endif
-
-
- lxvd2x vs0, o0, AO
- lxvd2x vs1, o16, AO
- lxvd2x vs2, o32, AO
- lxvd2x vs3, o48, AO
-
- addi AO, AO, 64
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
-
- addi BO, BO, 16
-
- // accumulator = accumulator + a * b
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs1, vs16
- xvmaddadp vs35, vs1, vs17
- xvmaddadp vs36, vs2, vs16
- xvmaddadp vs37, vs2, vs17
- xvmaddadp vs38, vs3, vs16
- xvmaddadp vs39, vs3, vs17
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // INIT_4x2
- // Clears the four accumulators vs32-vs35. vs0 is zeroed with xxlxor and
- // then copied into each accumulator through the XVMOVDP macro, which is
- // defined elsewhere (presumably a plain vector register move).
- // Leaves vs0 equal to zero.
- #if defined(_AIX)
- define(`INIT_4x2', `
- #else
- .macro INIT_4x2
- #endif
-
-
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // KERNEL_4x2
- // One multiply-accumulate step. Loads four doubles from AO (vs0, vs1)
- // and two doubles from BO, each replicated across both lanes (vs16,
- // vs17). Even accumulators take the product with vs16, odd accumulators
- // take the product with vs17.
- // Advances AO by 32 bytes and BO by 16 bytes.
- // Clobbers vs0, vs1, vs16, vs17 and updates vs32-vs35.
- #if defined(_AIX)
- define(`KERNEL_4x2', `
- #else
- .macro KERNEL_4x2
- #endif
-
-
- lxvd2x vs0, o0, AO
- lxvd2x vs1, o16, AO
-
- addi AO, AO, 32
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
-
- addi BO, BO, 16
-
- // accumulator = accumulator + a * b
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
- xvmaddadp vs34, vs1, vs16
- xvmaddadp vs35, vs1, vs17
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // INIT_2x2
- // Clears the two accumulators vs32 and vs33. vs0 is zeroed with xxlxor
- // and then copied into each accumulator through the XVMOVDP macro, which
- // is defined elsewhere (presumably a plain vector register move).
- // Leaves vs0 equal to zero.
- #if defined(_AIX)
- define(`INIT_2x2', `
- #else
- .macro INIT_2x2
- #endif
-
-
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // KERNEL_2x2
- // One multiply-accumulate step. Loads two doubles from AO (vs0) and two
- // doubles from BO, each replicated across both lanes (vs16, vs17).
- // vs32 accumulates vs0 * vs16 and vs33 accumulates vs0 * vs17.
- // Advances AO by 16 bytes and BO by 16 bytes.
- // Clobbers vs0, vs16, vs17 and updates vs32, vs33.
- #if defined(_AIX)
- define(`KERNEL_2x2', `
- #else
- .macro KERNEL_2x2
- #endif
-
-
- lxvd2x vs0, o0, AO
-
- addi AO, AO, 16
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
-
- addi BO, BO, 16
-
- // accumulator = accumulator + a * b
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // INIT_1x2
- // Clears the two accumulators vs32 and vs33. vs0 is zeroed with xxlxor
- // and then copied into each accumulator through the XVMOVDP macro, which
- // is defined elsewhere (presumably a plain vector register move).
- // Leaves vs0 equal to zero.
- #if defined(_AIX)
- define(`INIT_1x2', `
- #else
- .macro INIT_1x2
- #endif
-
-
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- // KERNEL_1x2
- // One multiply-accumulate step. Loads a single double from AO and two
- // doubles from BO; every load uses lxvdsx, so each value is replicated
- // across both lanes. vs32 accumulates vs0 * vs16 and vs33 accumulates
- // vs0 * vs17, which leaves both lanes of each accumulator identical as
- // long as they started identical.
- // Advances AO by 8 bytes and BO by 16 bytes.
- // Clobbers vs0, vs16, vs17 and updates vs32, vs33.
- #if defined(_AIX)
- define(`KERNEL_1x2', `
- #else
- .macro KERNEL_1x2
- #endif
-
-
- lxvdsx vs0, o0, AO
-
- addi AO, AO, 8
-
- lxvdsx vs16, o0, BO
- lxvdsx vs17, o8, BO
-
- addi BO, BO, 16
-
- // accumulator = accumulator + a * b
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs0, vs17
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 16x2
- ##########################################################################################*/
-
- // SOLVE_LT_16x2
- // Solve step for the tile accumulated in vs32-vs47. Reads sixteen
- // two-double vectors at BO and the coefficients at AO, writes the result
- // back to BO and to C at CO and CO+LDC.
- // The coefficient pointer walks AO in sixteen stages: stage k skips k
- // elements and then reads 16-k elements.
- // Clobbers vs0-vs15, vs32-vs47, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_16x2', `
- #else
- .macro SOLVE_LT_16x2
- #endif
-
- // Regroup the accumulators: selector 0 pairs doubleword 0 of both
- // sources, selector 3 pairs doubleword 1 of both sources. The results
- // vs0-vs15 line up one to one with the vectors loaded from BO below.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs32, vs33, 3
-
- xxpermdi vs2, vs34, vs35, 0
- xxpermdi vs3, vs34, vs35, 3
-
- xxpermdi vs4, vs36, vs37, 0
- xxpermdi vs5, vs36, vs37, 3
-
- xxpermdi vs6, vs38, vs39, 0
- xxpermdi vs7, vs38, vs39, 3
-
- xxpermdi vs8, vs40, vs41, 0
- xxpermdi vs9, vs40, vs41, 3
-
- xxpermdi vs10, vs42, vs43, 0
- xxpermdi vs11, vs42, vs43, 3
-
- xxpermdi vs12, vs44, vs45, 0
- xxpermdi vs13, vs44, vs45, 3
-
- xxpermdi vs14, vs46, vs47, 0
- xxpermdi vs15, vs46, vs47, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs36, o0, T1
- lxvd2x vs37, o16, T1
- lxvd2x vs38, o32, T1
- lxvd2x vs39, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs40, o0, T1
- lxvd2x vs41, o16, T1
- lxvd2x vs42, o32, T1
- lxvd2x vs43, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs44, o0, T1
- lxvd2x vs45, o16, T1
- lxvd2x vs46, o32, T1
- lxvd2x vs47, o48, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
- xvsubdp vs36, vs36, vs4
- xvsubdp vs37, vs37, vs5
- xvsubdp vs38, vs38, vs6
- xvsubdp vs39, vs39, vs7
- xvsubdp vs40, vs40, vs8
- xvsubdp vs41, vs41, vs9
- xvsubdp vs42, vs42, vs10
- xvsubdp vs43, vs43, vs11
- xvsubdp vs44, vs44, vs12
- xvsubdp vs45, vs45, vs13
- xvsubdp vs46, vs46, vs14
- xvsubdp vs47, vs47, vs15
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // Each lxvdsx loads one coefficient and replicates it in both lanes.
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
- lxvdsx vs14, o16, T1
- lxvdsx vs15, o24, T1
-
- addi T1, T1, 32
-
- // The first coefficient is applied by multiplication, so it is
- // presumably stored pre-inverted by the packing code (TODO confirm).
- // xvnmsubadp computes target = target - a * b, removing the
- // contribution of the vector just solved from every later vector.
- xvmuldp vs32, vs32, vs0
- xvnmsubadp vs33, vs32, vs1
- xvnmsubadp vs34, vs32, vs2
- xvnmsubadp vs35, vs32, vs3
- xvnmsubadp vs36, vs32, vs4
- xvnmsubadp vs37, vs32, vs5
- xvnmsubadp vs38, vs32, vs6
- xvnmsubadp vs39, vs32, vs7
- xvnmsubadp vs40, vs32, vs8
- xvnmsubadp vs41, vs32, vs9
- xvnmsubadp vs42, vs32, vs10
- xvnmsubadp vs43, vs32, vs11
- xvnmsubadp vs44, vs32, vs12
- xvnmsubadp vs45, vs32, vs13
- xvnmsubadp vs46, vs32, vs14
- xvnmsubadp vs47, vs32, vs15
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this stage does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
- lxvdsx vs14, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs33, vs33, vs0
- xvnmsubadp vs34, vs33, vs1
- xvnmsubadp vs35, vs33, vs2
- xvnmsubadp vs36, vs33, vs3
- xvnmsubadp vs37, vs33, vs4
- xvnmsubadp vs38, vs33, vs5
- xvnmsubadp vs39, vs33, vs6
- xvnmsubadp vs40, vs33, vs7
- xvnmsubadp vs41, vs33, vs8
- xvnmsubadp vs42, vs33, vs9
- xvnmsubadp vs43, vs33, vs10
- xvnmsubadp vs44, vs33, vs11
- xvnmsubadp vs45, vs33, vs12
- xvnmsubadp vs46, vs33, vs13
- xvnmsubadp vs47, vs33, vs14
-
- //############### OFFSET 2 #######################
-
- // Skip two unused coefficients.
- addi T1, T1, 2*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs12, o0, T1
- lxvdsx vs13, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs34, vs34, vs0
- xvnmsubadp vs35, vs34, vs1
- xvnmsubadp vs36, vs34, vs2
- xvnmsubadp vs37, vs34, vs3
- xvnmsubadp vs38, vs34, vs4
- xvnmsubadp vs39, vs34, vs5
- xvnmsubadp vs40, vs34, vs6
- xvnmsubadp vs41, vs34, vs7
- xvnmsubadp vs42, vs34, vs8
- xvnmsubadp vs43, vs34, vs9
- xvnmsubadp vs44, vs34, vs10
- xvnmsubadp vs45, vs34, vs11
- xvnmsubadp vs46, vs34, vs12
- xvnmsubadp vs47, vs34, vs13
-
- //############### OFFSET 3 #######################
-
- // Skip three unused coefficients.
- addi T1, T1, 3*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs12, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs35, vs35, vs0
- xvnmsubadp vs36, vs35, vs1
- xvnmsubadp vs37, vs35, vs2
- xvnmsubadp vs38, vs35, vs3
- xvnmsubadp vs39, vs35, vs4
- xvnmsubadp vs40, vs35, vs5
- xvnmsubadp vs41, vs35, vs6
- xvnmsubadp vs42, vs35, vs7
- xvnmsubadp vs43, vs35, vs8
- xvnmsubadp vs44, vs35, vs9
- xvnmsubadp vs45, vs35, vs10
- xvnmsubadp vs46, vs35, vs11
- xvnmsubadp vs47, vs35, vs12
-
- //############### OFFSET 4 #######################
-
- // Skip four unused coefficients.
- addi T1, T1, 4*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
- lxvdsx vs11, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs36, vs36, vs0
- xvnmsubadp vs37, vs36, vs1
- xvnmsubadp vs38, vs36, vs2
- xvnmsubadp vs39, vs36, vs3
- xvnmsubadp vs40, vs36, vs4
- xvnmsubadp vs41, vs36, vs5
- xvnmsubadp vs42, vs36, vs6
- xvnmsubadp vs43, vs36, vs7
- xvnmsubadp vs44, vs36, vs8
- xvnmsubadp vs45, vs36, vs9
- xvnmsubadp vs46, vs36, vs10
- xvnmsubadp vs47, vs36, vs11
-
- //############### OFFSET 5 #######################
-
- // Skip five unused coefficients.
- addi T1, T1, 5*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
- lxvdsx vs10, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs37, vs37, vs0
- xvnmsubadp vs38, vs37, vs1
- xvnmsubadp vs39, vs37, vs2
- xvnmsubadp vs40, vs37, vs3
- xvnmsubadp vs41, vs37, vs4
- xvnmsubadp vs42, vs37, vs5
- xvnmsubadp vs43, vs37, vs6
- xvnmsubadp vs44, vs37, vs7
- xvnmsubadp vs45, vs37, vs8
- xvnmsubadp vs46, vs37, vs9
- xvnmsubadp vs47, vs37, vs10
-
- //############### OFFSET 6 #######################
-
- // Skip six unused coefficients.
- addi T1, T1, 6*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
- lxvdsx vs9, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs38, vs38, vs0
- xvnmsubadp vs39, vs38, vs1
- xvnmsubadp vs40, vs38, vs2
- xvnmsubadp vs41, vs38, vs3
- xvnmsubadp vs42, vs38, vs4
- xvnmsubadp vs43, vs38, vs5
- xvnmsubadp vs44, vs38, vs6
- xvnmsubadp vs45, vs38, vs7
- xvnmsubadp vs46, vs38, vs8
- xvnmsubadp vs47, vs38, vs9
-
- //############### OFFSET 7 #######################
-
- // Skip seven unused coefficients.
- addi T1, T1, 7*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs8, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs39, vs39, vs0
- xvnmsubadp vs40, vs39, vs1
- xvnmsubadp vs41, vs39, vs2
- xvnmsubadp vs42, vs39, vs3
- xvnmsubadp vs43, vs39, vs4
- xvnmsubadp vs44, vs39, vs5
- xvnmsubadp vs45, vs39, vs6
- xvnmsubadp vs46, vs39, vs7
- xvnmsubadp vs47, vs39, vs8
-
- //############### OFFSET 8 #######################
-
- // Skip eight unused coefficients.
- addi T1, T1, 8*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs40, vs40, vs0
- xvnmsubadp vs41, vs40, vs1
- xvnmsubadp vs42, vs40, vs2
- xvnmsubadp vs43, vs40, vs3
- xvnmsubadp vs44, vs40, vs4
- xvnmsubadp vs45, vs40, vs5
- xvnmsubadp vs46, vs40, vs6
- xvnmsubadp vs47, vs40, vs7
-
- //############### OFFSET 9 #######################
-
- // Skip nine unused coefficients.
- addi T1, T1, 9*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs41, vs41, vs0
- xvnmsubadp vs42, vs41, vs1
- xvnmsubadp vs43, vs41, vs2
- xvnmsubadp vs44, vs41, vs3
- xvnmsubadp vs45, vs41, vs4
- xvnmsubadp vs46, vs41, vs5
- xvnmsubadp vs47, vs41, vs6
-
- //############### OFFSET 10 #######################
-
- // Skip ten unused coefficients.
- addi T1, T1, 10*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs42, vs42, vs0
- xvnmsubadp vs43, vs42, vs1
- xvnmsubadp vs44, vs42, vs2
- xvnmsubadp vs45, vs42, vs3
- xvnmsubadp vs46, vs42, vs4
- xvnmsubadp vs47, vs42, vs5
-
- //############### OFFSET 11 #######################
-
- // Skip eleven unused coefficients.
- addi T1, T1, 11*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs43, vs43, vs0
- xvnmsubadp vs44, vs43, vs1
- xvnmsubadp vs45, vs43, vs2
- xvnmsubadp vs46, vs43, vs3
- xvnmsubadp vs47, vs43, vs4
-
- //############### OFFSET 12 #######################
-
- // Skip twelve unused coefficients.
- addi T1, T1, 12*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs44, vs44, vs0
- xvnmsubadp vs45, vs44, vs1
- xvnmsubadp vs46, vs44, vs2
- xvnmsubadp vs47, vs44, vs3
-
- //############### OFFSET 13 #######################
-
- // Skip thirteen unused coefficients.
- addi T1, T1, 13*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs45, vs45, vs0
- xvnmsubadp vs46, vs45, vs1
- xvnmsubadp vs47, vs45, vs2
-
- //############### OFFSET 14 #######################
-
- // Skip fourteen unused coefficients.
- addi T1, T1, 14*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs46, vs46, vs0
- xvnmsubadp vs47, vs46, vs1
-
- //############### OFFSET 15 #######################
-
- // Skip fifteen unused coefficients; only the last one remains.
- addi T1, T1, 15*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs47, vs47, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs36, o0, T1
- stxvd2x vs37, o16, T1
- stxvd2x vs38, o32, T1
- stxvd2x vs39, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs40, o0, T1
- stxvd2x vs41, o16, T1
- stxvd2x vs42, o32, T1
- stxvd2x vs43, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs44, o0, T1
- stxvd2x vs45, o16, T1
- stxvd2x vs46, o32, T1
- stxvd2x vs47, o48, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the two doublewords, so the later stores through
- // T2 write the other half of each vector (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs33, o8, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs34, o16, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs35, o24, T1
- XXSWAPD(vs35,vs35)
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- XXSWAPD(vs36,vs36)
- stxsdx vs37, o8, T1
- XXSWAPD(vs37,vs37)
- stxsdx vs38, o16, T1
- XXSWAPD(vs38,vs38)
- stxsdx vs39, o24, T1
- XXSWAPD(vs39,vs39)
-
- addi T1, T1, 32
-
- stxsdx vs40, o0, T1
- XXSWAPD(vs40,vs40)
- stxsdx vs41, o8, T1
- XXSWAPD(vs41,vs41)
- stxsdx vs42, o16, T1
- XXSWAPD(vs42,vs42)
- stxsdx vs43, o24, T1
- XXSWAPD(vs43,vs43)
-
- addi T1, T1, 32
-
- stxsdx vs44, o0, T1
- XXSWAPD(vs44,vs44)
- stxsdx vs45, o8, T1
- XXSWAPD(vs45,vs45)
- stxsdx vs46, o16, T1
- XXSWAPD(vs46,vs46)
- stxsdx vs47, o24, T1
- XXSWAPD(vs47,vs47)
-
- stxsdx vs32, o0, T2
- stxsdx vs33, o8, T2
- stxsdx vs34, o16, T2
- stxsdx vs35, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs36, o0, T2
- stxsdx vs37, o8, T2
- stxsdx vs38, o16, T2
- stxsdx vs39, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs40, o0, T2
- stxsdx vs41, o8, T2
- stxsdx vs42, o16, T2
- stxsdx vs43, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs44, o0, T2
- stxsdx vs45, o8, T2
- stxsdx vs46, o16, T2
- stxsdx vs47, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 8x2
- ##########################################################################################*/
-
- // SOLVE_LT_8x2
- // Solve step for the tile accumulated in vs32-vs39. Reads eight
- // two-double vectors at BO and the coefficients at AO, writes the result
- // back to BO and to C at CO and CO+LDC.
- // The coefficient pointer walks AO in eight stages: stage k skips k
- // elements and then reads 8-k elements.
- // Clobbers vs0-vs7, vs32-vs39, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_8x2', `
- #else
- .macro SOLVE_LT_8x2
- #endif
-
- // Regroup the accumulators: selector 0 pairs doubleword 0 of both
- // sources, selector 3 pairs doubleword 1 of both sources.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs32, vs33, 3
-
- xxpermdi vs2, vs34, vs35, 0
- xxpermdi vs3, vs34, vs35, 3
-
- xxpermdi vs4, vs36, vs37, 0
- xxpermdi vs5, vs36, vs37, 3
-
- xxpermdi vs6, vs38, vs39, 0
- xxpermdi vs7, vs38, vs39, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- lxvd2x vs36, o0, T1
- lxvd2x vs37, o16, T1
- lxvd2x vs38, o32, T1
- lxvd2x vs39, o48, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
- xvsubdp vs36, vs36, vs4
- xvsubdp vs37, vs37, vs5
- xvsubdp vs38, vs38, vs6
- xvsubdp vs39, vs39, vs7
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // Each lxvdsx loads one coefficient and replicates it in both lanes.
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
- lxvdsx vs7, o24, T1
-
- addi T1, T1, 32
-
- // The first coefficient is applied by multiplication, so it is
- // presumably stored pre-inverted by the packing code (TODO confirm).
- // xvnmsubadp computes target = target - a * b.
- xvmuldp vs32, vs32, vs0
- xvnmsubadp vs33, vs32, vs1
- xvnmsubadp vs34, vs32, vs2
- xvnmsubadp vs35, vs32, vs3
- xvnmsubadp vs36, vs32, vs4
- xvnmsubadp vs37, vs32, vs5
- xvnmsubadp vs38, vs32, vs6
- xvnmsubadp vs39, vs32, vs7
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this stage does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
- lxvdsx vs6, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs33, vs33, vs0
- xvnmsubadp vs34, vs33, vs1
- xvnmsubadp vs35, vs33, vs2
- xvnmsubadp vs36, vs33, vs3
- xvnmsubadp vs37, vs33, vs4
- xvnmsubadp vs38, vs33, vs5
- xvnmsubadp vs39, vs33, vs6
-
- //############### OFFSET 2 #######################
-
- // Skip two unused coefficients.
- addi T1, T1, 2*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
- lxvdsx vs5, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs34, vs34, vs0
- xvnmsubadp vs35, vs34, vs1
- xvnmsubadp vs36, vs34, vs2
- xvnmsubadp vs37, vs34, vs3
- xvnmsubadp vs38, vs34, vs4
- xvnmsubadp vs39, vs34, vs5
-
- //############### OFFSET 3 #######################
-
- // Skip three unused coefficients.
- addi T1, T1, 3*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxvdsx vs4, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs35, vs35, vs0
- xvnmsubadp vs36, vs35, vs1
- xvnmsubadp vs37, vs35, vs2
- xvnmsubadp vs38, vs35, vs3
- xvnmsubadp vs39, vs35, vs4
-
- //############### OFFSET 4 #######################
-
- // Skip four unused coefficients.
- addi T1, T1, 4*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- xvmuldp vs36, vs36, vs0
- xvnmsubadp vs37, vs36, vs1
- xvnmsubadp vs38, vs36, vs2
- xvnmsubadp vs39, vs36, vs3
-
- //############### OFFSET 5 #######################
-
- // Skip five unused coefficients.
- addi T1, T1, 5*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs37, vs37, vs0
- xvnmsubadp vs38, vs37, vs1
- xvnmsubadp vs39, vs37, vs2
-
- //############### OFFSET 6 #######################
-
- // Skip six unused coefficients.
- addi T1, T1, 6*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs38, vs38, vs0
- xvnmsubadp vs39, vs38, vs1
-
- //############### OFFSET 7 #######################
-
- // Skip seven unused coefficients; only the last one remains.
- addi T1, T1, 7*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs39, vs39, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- addi T1, T1, 64
-
- stxvd2x vs36, o0, T1
- stxvd2x vs37, o16, T1
- stxvd2x vs38, o32, T1
- stxvd2x vs39, o48, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the two doublewords, so the later stores through
- // T2 write the other half of each vector (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs33, o8, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs34, o16, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs35, o24, T1
- XXSWAPD(vs35,vs35)
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- XXSWAPD(vs36,vs36)
- stxsdx vs37, o8, T1
- XXSWAPD(vs37,vs37)
- stxsdx vs38, o16, T1
- XXSWAPD(vs38,vs38)
- stxsdx vs39, o24, T1
- XXSWAPD(vs39,vs39)
-
- stxsdx vs32, o0, T2
- stxsdx vs33, o8, T2
- stxsdx vs34, o16, T2
- stxsdx vs35, o24, T2
-
- addi T2, T2, 32
-
- stxsdx vs36, o0, T2
- stxsdx vs37, o8, T2
- stxsdx vs38, o16, T2
- stxsdx vs39, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 4x2
- ##########################################################################################*/
-
- // SOLVE_LT_4x2
- // Solve step for the tile accumulated in vs32-vs35. Reads four
- // two-double vectors at BO and the coefficients at AO, writes the result
- // back to BO and to C at CO and CO+LDC.
- // The coefficient pointer walks AO in four stages: stage k skips k
- // elements and then reads 4-k elements.
- // Clobbers vs0-vs3, vs32-vs35, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_4x2', `
- #else
- .macro SOLVE_LT_4x2
- #endif
-
- // Regroup the accumulators: selector 0 pairs doubleword 0 of both
- // sources, selector 3 pairs doubleword 1 of both sources.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs32, vs33, 3
-
- xxpermdi vs2, vs34, vs35, 0
- xxpermdi vs3, vs34, vs35, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
- lxvd2x vs34, o32, T1
- lxvd2x vs35, o48, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
- xvsubdp vs34, vs34, vs2
- xvsubdp vs35, vs35, vs3
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
- lxvdsx vs3, o24, T1
-
- addi T1, T1, 32
-
- // The first coefficient is applied by multiplication, so it is
- // presumably stored pre-inverted by the packing code (TODO confirm).
- // xvnmsubadp computes target = target - a * b.
- xvmuldp vs32, vs32, vs0
- xvnmsubadp vs33, vs32, vs1
- xvnmsubadp vs34, vs32, vs2
- xvnmsubadp vs35, vs32, vs3
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this stage does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
- lxvdsx vs2, o16, T1
-
- addi T1, T1, 24
-
- xvmuldp vs33, vs33, vs0
- xvnmsubadp vs34, vs33, vs1
- xvnmsubadp vs35, vs33, vs2
-
- //############### OFFSET 2 #######################
-
- // Skip two unused coefficients.
- addi T1, T1, 2*SIZE
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- xvmuldp vs34, vs34, vs0
- xvnmsubadp vs35, vs34, vs1
-
- //############### OFFSET 3 #######################
-
- // Skip three unused coefficients; only the last one remains.
- addi T1, T1, 3*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs35, vs35, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
- stxvd2x vs34, o32, T1
- stxvd2x vs35, o48, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the two doublewords, so the later stores through
- // T2 write the other half of each vector (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs33, o8, T1
- XXSWAPD(vs33,vs33)
- stxsdx vs34, o16, T1
- XXSWAPD(vs34,vs34)
- stxsdx vs35, o24, T1
- XXSWAPD(vs35,vs35)
-
- stxsdx vs32, o0, T2
- stxsdx vs33, o8, T2
- stxsdx vs34, o16, T2
- stxsdx vs35, o24, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 2x2
- ##########################################################################################*/
-
- // SOLVE_LT_2x2
- // Solve step for the tile accumulated in vs32 and vs33. Reads two
- // two-double vectors at BO and three of the four coefficients at AO,
- // writes the result back to BO and to C at CO and CO+LDC.
- // Clobbers vs0, vs1, vs32, vs33, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_2x2', `
- #else
- .macro SOLVE_LT_2x2
- #endif
-
- // Regroup the accumulators: selector 0 pairs doubleword 0 of both
- // sources, selector 3 pairs doubleword 1 of both sources.
- xxpermdi vs0, vs32, vs33, 0
- xxpermdi vs1, vs32, vs33, 3
-
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
- lxvd2x vs33, o16, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
- xvsubdp vs33, vs33, vs1
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
- lxvdsx vs1, o8, T1
-
- addi T1, T1, 16
-
- // The first coefficient is applied by multiplication, so it is
- // presumably stored pre-inverted by the packing code (TODO confirm).
- // xvnmsubadp computes target = target - a * b.
- xvmuldp vs32, vs32, vs0
- xvnmsubadp vs33, vs32, vs1
-
- //############### OFFSET 1 #######################
-
- // Skip one coefficient that this stage does not use.
- addi T1, T1, 1*SIZE
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- xvmuldp vs33, vs33, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
- stxvd2x vs33, o16, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the two doublewords, so the later stores through
- // T2 write the other half of each vector (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
- stxsdx vs33, o8, T1
- XXSWAPD(vs33,vs33)
-
- stxsdx vs32, o0, T2
- stxsdx vs33, o8, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 1x2
- ##########################################################################################*/
-
- // SOLVE_LT_1x2
- // Solve step for a single row spread over accumulators vs32 and vs33.
- // Reads one two-double vector at BO and one coefficient at AO, writes
- // the result back to BO and to C at CO and CO+LDC.
- // Clobbers vs0, vs32, T1 and T2.
- #if defined(_AIX)
- define(`SOLVE_LT_1x2', `
- #else
- .macro SOLVE_LT_1x2
- #endif
-
- // Gather doubleword 0 of vs32 and doubleword 0 of vs33 into one vector.
- xxpermdi vs0, vs32, vs33, 0
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxvd2x vs32, o0, T1
-
- // Right-hand side minus the accumulated products.
- xvsubdp vs32, vs32, vs0
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxvdsx vs0, o0, T1
-
- addi T1, T1, 8
-
- // Applied by multiplication, so the coefficient is presumably stored
- // pre-inverted by the packing code (TODO confirm).
- xvmuldp vs32, vs32, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxvd2x vs32, o0, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
- add T2, CO, LDC
-
-
- // stxsdx stores doubleword 0 only. XXSWAPD is defined elsewhere and
- // presumably exchanges the doublewords so the T2 store writes the other
- // half (TODO confirm).
- stxsdx vs32, o0, T1
- XXSWAPD(vs32,vs32)
-
- stxsdx vs32, o0, T2
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* INIT_16x1: initialise the sixteen accumulators vs32-vs47 used by KERNEL_16x1.
-    vs0 is zeroed with xxlxor and then passed with each accumulator to XVMOVDP
-    (defined outside this chunk; presumably a plain vector register copy, which
-    would leave the accumulators cleared - TODO confirm). Clobbers vs0. */
- #if defined(_AIX)
- define(`INIT_16x1', `
- #else
- .macro INIT_16x1
- #endif
-
-
- // vs0 = 0
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
- XVMOVDP(vs40,vs0)
- XVMOVDP(vs41,vs0)
- XVMOVDP(vs42,vs0)
- XVMOVDP(vs43,vs0)
- XVMOVDP(vs44,vs0)
- XVMOVDP(vs45,vs0)
- XVMOVDP(vs46,vs0)
- XVMOVDP(vs47,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* KERNEL_16x1: one accumulation step for a 16x1 tile.
-    Reads 16 doubles through AO and one through BO, each replicated into both
-    doublewords by lxvdsx, then adds a[i]*b into accumulator vs(32+i), i = 0..15.
-    AO advances by 128 bytes in total and BO by 8.
-    o0/o8/o16/o24 are index registers defined outside this chunk, presumably
-    holding the byte offsets 0/8/16/24 - TODO confirm.
-    Clobbers vs0-vs16. */
- #if defined(_AIX)
- define(`KERNEL_16x1', `
- #else
- .macro KERNEL_16x1
- #endif
-
-
- // A values 0..15, four per group, AO bumped by 32 bytes after each group
- lxvdsx vs0, o0, AO
- lxvdsx vs1, o8, AO
- lxvdsx vs2, o16, AO
- lxvdsx vs3, o24, AO
-
- addi AO, AO, 32
-
- lxvdsx vs4, o0, AO
- lxvdsx vs5, o8, AO
- lxvdsx vs6, o16, AO
- lxvdsx vs7, o24, AO
-
- addi AO, AO, 32
-
- lxvdsx vs8, o0, AO
- lxvdsx vs9, o8, AO
- lxvdsx vs10, o16, AO
- lxvdsx vs11, o24, AO
-
- addi AO, AO, 32
-
- lxvdsx vs12, o0, AO
- lxvdsx vs13, o8, AO
- lxvdsx vs14, o16, AO
- lxvdsx vs15, o24, AO
-
- addi AO, AO, 32
-
- // the single B value for this step
- lxvdsx vs16, o0, BO
-
- addi BO, BO, 8
-
- // vs(32+i) += vs(i) * vs16
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs1, vs16
- xvmaddadp vs34, vs2, vs16
- xvmaddadp vs35, vs3, vs16
- xvmaddadp vs36, vs4, vs16
- xvmaddadp vs37, vs5, vs16
- xvmaddadp vs38, vs6, vs16
- xvmaddadp vs39, vs7, vs16
- xvmaddadp vs40, vs8, vs16
- xvmaddadp vs41, vs9, vs16
- xvmaddadp vs42, vs10, vs16
- xvmaddadp vs43, vs11, vs16
- xvmaddadp vs44, vs12, vs16
- xvmaddadp vs45, vs13, vs16
- xvmaddadp vs46, vs14, vs16
- xvmaddadp vs47, vs15, vs16
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* INIT_8x1: initialise the eight accumulators vs32-vs39 used by KERNEL_8x1.
-    vs0 is zeroed with xxlxor and then passed with each accumulator to XVMOVDP
-    (defined outside this chunk; presumably a plain vector register copy, which
-    would leave the accumulators cleared - TODO confirm). Clobbers vs0. */
- #if defined(_AIX)
- define(`INIT_8x1', `
- #else
- .macro INIT_8x1
- #endif
-
-
- // vs0 = 0
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
- XVMOVDP(vs36,vs0)
- XVMOVDP(vs37,vs0)
- XVMOVDP(vs38,vs0)
- XVMOVDP(vs39,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* KERNEL_8x1: one accumulation step for an 8x1 tile.
-    Reads 8 doubles through AO and one through BO, each replicated into both
-    doublewords by lxvdsx, then adds a[i]*b into accumulator vs(32+i), i = 0..7.
-    AO advances by 64 bytes in total and BO by 8.
-    o0/o8/o16/o24 are index registers defined outside this chunk, presumably
-    holding the byte offsets 0/8/16/24 - TODO confirm.
-    Clobbers vs0-vs7 and vs16. */
- #if defined(_AIX)
- define(`KERNEL_8x1', `
- #else
- .macro KERNEL_8x1
- #endif
-
-
- // A values 0..7, four per group, AO bumped by 32 bytes after each group
- lxvdsx vs0, o0, AO
- lxvdsx vs1, o8, AO
- lxvdsx vs2, o16, AO
- lxvdsx vs3, o24, AO
-
- addi AO, AO, 32
-
- lxvdsx vs4, o0, AO
- lxvdsx vs5, o8, AO
- lxvdsx vs6, o16, AO
- lxvdsx vs7, o24, AO
-
- addi AO, AO, 32
-
- // the single B value for this step
- lxvdsx vs16, o0, BO
-
- addi BO, BO, 8
-
- // vs(32+i) += vs(i) * vs16
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs1, vs16
- xvmaddadp vs34, vs2, vs16
- xvmaddadp vs35, vs3, vs16
- xvmaddadp vs36, vs4, vs16
- xvmaddadp vs37, vs5, vs16
- xvmaddadp vs38, vs6, vs16
- xvmaddadp vs39, vs7, vs16
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* INIT_4x1: initialise the four accumulators vs32-vs35 used by KERNEL_4x1.
-    vs0 is zeroed with xxlxor and then passed with each accumulator to XVMOVDP
-    (defined outside this chunk; presumably a plain vector register copy, which
-    would leave the accumulators cleared - TODO confirm). Clobbers vs0. */
- #if defined(_AIX)
- define(`INIT_4x1', `
- #else
- .macro INIT_4x1
- #endif
-
-
- // vs0 = 0
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
- XVMOVDP(vs34,vs0)
- XVMOVDP(vs35,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* KERNEL_4x1: one accumulation step for a 4x1 tile.
-    Reads 4 doubles through AO and one through BO, each replicated into both
-    doublewords by lxvdsx, then adds a[i]*b into accumulator vs(32+i), i = 0..3.
-    AO advances by 32 bytes and BO by 8.
-    o0/o8/o16/o24 are index registers defined outside this chunk, presumably
-    holding the byte offsets 0/8/16/24 - TODO confirm.
-    Clobbers vs0-vs3 and vs16. */
- #if defined(_AIX)
- define(`KERNEL_4x1', `
- #else
- .macro KERNEL_4x1
- #endif
-
-
- lxvdsx vs0, o0, AO
- lxvdsx vs1, o8, AO
- lxvdsx vs2, o16, AO
- lxvdsx vs3, o24, AO
-
- addi AO, AO, 32
-
- // the single B value for this step
- lxvdsx vs16, o0, BO
-
- addi BO, BO, 8
-
- // vs(32+i) += vs(i) * vs16
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs1, vs16
- xvmaddadp vs34, vs2, vs16
- xvmaddadp vs35, vs3, vs16
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* INIT_2x1: initialise the two accumulators vs32-vs33 used by KERNEL_2x1.
-    vs0 is zeroed with xxlxor and then passed with each accumulator to XVMOVDP
-    (defined outside this chunk; presumably a plain vector register copy, which
-    would leave the accumulators cleared - TODO confirm). Clobbers vs0. */
- #if defined(_AIX)
- define(`INIT_2x1', `
- #else
- .macro INIT_2x1
- #endif
-
-
- // vs0 = 0
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
- XVMOVDP(vs33,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* KERNEL_2x1: one accumulation step for a 2x1 tile.
-    Reads 2 doubles through AO and one through BO, each replicated into both
-    doublewords by lxvdsx, then adds a[i]*b into accumulator vs(32+i), i = 0..1.
-    AO advances by 16 bytes and BO by 8.
-    o0/o8 are index registers defined outside this chunk, presumably holding
-    the byte offsets 0/8 - TODO confirm.
-    Clobbers vs0, vs1 and vs16. */
- #if defined(_AIX)
- define(`KERNEL_2x1', `
- #else
- .macro KERNEL_2x1
- #endif
-
-
- lxvdsx vs0, o0, AO
- lxvdsx vs1, o8, AO
-
- addi AO, AO, 16
-
- // the single B value for this step
- lxvdsx vs16, o0, BO
-
- addi BO, BO, 8
-
- // vs(32+i) += vs(i) * vs16
- xvmaddadp vs32, vs0, vs16
- xvmaddadp vs33, vs1, vs16
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* INIT_1x1: initialise the single accumulator vs32 used by KERNEL_1x1.
-    vs0 is zeroed with xxlxor and then passed with vs32 to XVMOVDP (defined
-    outside this chunk; presumably a plain vector register copy, which would
-    leave the accumulator cleared - TODO confirm). Clobbers vs0. */
- #if defined(_AIX)
- define(`INIT_1x1', `
- #else
- .macro INIT_1x1
- #endif
-
-
- // vs0 = 0
- xxlxor vs0, vs0, vs0
-
- XVMOVDP(vs32,vs0)
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /* KERNEL_1x1: one accumulation step for a 1x1 tile.
-    Reads one double through AO and one through BO, each replicated into both
-    doublewords by lxvdsx, then adds their product into accumulator vs32.
-    AO and BO each advance by 8 bytes.
-    o0 is an index register defined outside this chunk, presumably holding
-    the byte offset 0 - TODO confirm.
-    Clobbers vs0 and vs16. */
- #if defined(_AIX)
- define(`KERNEL_1x1', `
- #else
- .macro KERNEL_1x1
- #endif
-
-
- lxvdsx vs0, o0, AO
-
- addi AO, AO, 8
-
- lxvdsx vs16, o0, BO
-
- addi BO, BO, 8
-
- // vs32 += vs0 * vs16
- xvmaddadp vs32, vs0, vs16
-
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 16x1
-
- Solves a 16x1 tile by forward substitution using scalar (doubleword 0) arithmetic.
-   1. The accumulators vs32-vs47 are moved to vs0-vs15 through XXSWAPD (defined outside
-      this chunk; presumably a doubleword swap - TODO confirm).
-   2. 16 doubles are loaded from BO and the moved accumulators are subtracted from them.
-   3. For each row k = 0..15 of the data at AO: vs(32+k) is multiplied by the first value
-      read for that row (multiplied, not divided, so the packed data presumably stores
-      reciprocal diagonals - TODO confirm); that product times each following value of
-      the row is then subtracted from vs(33+k)..vs47.
-      Before row k the pointer is bumped by k*SIZE to skip the k entries ahead of the row
-      head (this lands on the diagonal of a 16-wide row if SIZE is the 8-byte element
-      size - SIZE is defined outside this chunk).
-   4. The 16 results are stored to BO and to CO.
- o0/o8/o16/o24 are index registers defined outside this chunk, presumably holding the byte
- offsets 0/8/16/24 - TODO confirm.
- AO, BO and CO are not modified. Clobbers T1, vs0-vs15 and vs32-vs47.
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_16x1', `
- #else
- .macro SOLVE_LT_16x1
- #endif
-
- // free vs32-vs47 for the right-hand side by parking the accumulators in vs0-vs15
- XXSWAPD(vs0,vs32)
- XXSWAPD(vs1,vs33)
- XXSWAPD(vs2,vs34)
- XXSWAPD(vs3,vs35)
- XXSWAPD(vs4,vs36)
- XXSWAPD(vs5,vs37)
- XXSWAPD(vs6,vs38)
- XXSWAPD(vs7,vs39)
- XXSWAPD(vs8,vs40)
- XXSWAPD(vs9,vs41)
- XXSWAPD(vs10,vs42)
- XXSWAPD(vs11,vs43)
- XXSWAPD(vs12,vs44)
- XXSWAPD(vs13,vs45)
- XXSWAPD(vs14,vs46)
- XXSWAPD(vs15,vs47)
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxsdx vs32, o0, T1
- lxsdx vs33, o8, T1
- lxsdx vs34, o16, T1
- lxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs36, o0, T1
- lxsdx vs37, o8, T1
- lxsdx vs38, o16, T1
- lxsdx vs39, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs40, o0, T1
- lxsdx vs41, o8, T1
- lxsdx vs42, o16, T1
- lxsdx vs43, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs44, o0, T1
- lxsdx vs45, o8, T1
- lxsdx vs46, o16, T1
- lxsdx vs47, o24, T1
-
- // right-hand side minus the accumulated products
- xssubdp vs32, vs32, vs0
- xssubdp vs33, vs33, vs1
- xssubdp vs34, vs34, vs2
- xssubdp vs35, vs35, vs3
- xssubdp vs36, vs36, vs4
- xssubdp vs37, vs37, vs5
- xssubdp vs38, vs38, vs6
- xssubdp vs39, vs39, vs7
- xssubdp vs40, vs40, vs8
- xssubdp vs41, vs41, vs9
- xssubdp vs42, vs42, vs10
- xssubdp vs43, vs43, vs11
- xssubdp vs44, vs44, vs12
- xssubdp vs45, vs45, vs13
- xssubdp vs46, vs46, vs14
- xssubdp vs47, vs47, vs15
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // row 0: all 16 values are used, vs0 is the row head
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
- lxsdx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs12, o0, T1
- lxsdx vs13, o8, T1
- lxsdx vs14, o16, T1
- lxsdx vs15, o24, T1
-
- addi T1, T1, 32
-
- // finish unknown 0, then vs(32+j) = vs(32+j) - vs32 * vs(j) for j = 1..15
- xsmuldp vs32, vs32, vs0
- xsnmsubadp vs33, vs32, vs1
- xsnmsubadp vs34, vs32, vs2
- xsnmsubadp vs35, vs32, vs3
- xsnmsubadp vs36, vs32, vs4
- xsnmsubadp vs37, vs32, vs5
- xsnmsubadp vs38, vs32, vs6
- xsnmsubadp vs39, vs32, vs7
- xsnmsubadp vs40, vs32, vs8
- xsnmsubadp vs41, vs32, vs9
- xsnmsubadp vs42, vs32, vs10
- xsnmsubadp vs43, vs32, vs11
- xsnmsubadp vs44, vs32, vs12
- xsnmsubadp vs45, vs32, vs13
- xsnmsubadp vs46, vs32, vs14
- xsnmsubadp vs47, vs32, vs15
-
- //############### OFFSET 1 #######################
-
- // row 1: skip the one entry ahead of the row head, then read the remaining 15 values
- addi T1, T1, 1*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
- lxsdx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs12, o0, T1
- lxsdx vs13, o8, T1
- lxsdx vs14, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs33, vs33, vs0
- xsnmsubadp vs34, vs33, vs1
- xsnmsubadp vs35, vs33, vs2
- xsnmsubadp vs36, vs33, vs3
- xsnmsubadp vs37, vs33, vs4
- xsnmsubadp vs38, vs33, vs5
- xsnmsubadp vs39, vs33, vs6
- xsnmsubadp vs40, vs33, vs7
- xsnmsubadp vs41, vs33, vs8
- xsnmsubadp vs42, vs33, vs9
- xsnmsubadp vs43, vs33, vs10
- xsnmsubadp vs44, vs33, vs11
- xsnmsubadp vs45, vs33, vs12
- xsnmsubadp vs46, vs33, vs13
- xsnmsubadp vs47, vs33, vs14
-
- //############### OFFSET 2 #######################
-
- // rows 2..15 repeat the row 1 pattern with one fewer value each time
- addi T1, T1, 2*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
- lxsdx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs12, o0, T1
- lxsdx vs13, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs34, vs34, vs0
- xsnmsubadp vs35, vs34, vs1
- xsnmsubadp vs36, vs34, vs2
- xsnmsubadp vs37, vs34, vs3
- xsnmsubadp vs38, vs34, vs4
- xsnmsubadp vs39, vs34, vs5
- xsnmsubadp vs40, vs34, vs6
- xsnmsubadp vs41, vs34, vs7
- xsnmsubadp vs42, vs34, vs8
- xsnmsubadp vs43, vs34, vs9
- xsnmsubadp vs44, vs34, vs10
- xsnmsubadp vs45, vs34, vs11
- xsnmsubadp vs46, vs34, vs12
- xsnmsubadp vs47, vs34, vs13
-
- //############### OFFSET 3 #######################
-
- addi T1, T1, 3*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
- lxsdx vs11, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs12, o0, T1
-
- addi T1, T1, 8
-
- xsmuldp vs35, vs35, vs0
- xsnmsubadp vs36, vs35, vs1
- xsnmsubadp vs37, vs35, vs2
- xsnmsubadp vs38, vs35, vs3
- xsnmsubadp vs39, vs35, vs4
- xsnmsubadp vs40, vs35, vs5
- xsnmsubadp vs41, vs35, vs6
- xsnmsubadp vs42, vs35, vs7
- xsnmsubadp vs43, vs35, vs8
- xsnmsubadp vs44, vs35, vs9
- xsnmsubadp vs45, vs35, vs10
- xsnmsubadp vs46, vs35, vs11
- xsnmsubadp vs47, vs35, vs12
-
- //############### OFFSET 4 #######################
-
- addi T1, T1, 4*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
- lxsdx vs11, o24, T1
-
- addi T1, T1, 32
-
- xsmuldp vs36, vs36, vs0
- xsnmsubadp vs37, vs36, vs1
- xsnmsubadp vs38, vs36, vs2
- xsnmsubadp vs39, vs36, vs3
- xsnmsubadp vs40, vs36, vs4
- xsnmsubadp vs41, vs36, vs5
- xsnmsubadp vs42, vs36, vs6
- xsnmsubadp vs43, vs36, vs7
- xsnmsubadp vs44, vs36, vs8
- xsnmsubadp vs45, vs36, vs9
- xsnmsubadp vs46, vs36, vs10
- xsnmsubadp vs47, vs36, vs11
-
- //############### OFFSET 5 #######################
-
- addi T1, T1, 5*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
- lxsdx vs10, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs37, vs37, vs0
- xsnmsubadp vs38, vs37, vs1
- xsnmsubadp vs39, vs37, vs2
- xsnmsubadp vs40, vs37, vs3
- xsnmsubadp vs41, vs37, vs4
- xsnmsubadp vs42, vs37, vs5
- xsnmsubadp vs43, vs37, vs6
- xsnmsubadp vs44, vs37, vs7
- xsnmsubadp vs45, vs37, vs8
- xsnmsubadp vs46, vs37, vs9
- xsnmsubadp vs47, vs37, vs10
-
- //############### OFFSET 6 #######################
-
- addi T1, T1, 6*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
- lxsdx vs9, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs38, vs38, vs0
- xsnmsubadp vs39, vs38, vs1
- xsnmsubadp vs40, vs38, vs2
- xsnmsubadp vs41, vs38, vs3
- xsnmsubadp vs42, vs38, vs4
- xsnmsubadp vs43, vs38, vs5
- xsnmsubadp vs44, vs38, vs6
- xsnmsubadp vs45, vs38, vs7
- xsnmsubadp vs46, vs38, vs8
- xsnmsubadp vs47, vs38, vs9
-
- //############### OFFSET 7 #######################
-
- addi T1, T1, 7*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs8, o0, T1
-
- addi T1, T1, 8
-
- xsmuldp vs39, vs39, vs0
- xsnmsubadp vs40, vs39, vs1
- xsnmsubadp vs41, vs39, vs2
- xsnmsubadp vs42, vs39, vs3
- xsnmsubadp vs43, vs39, vs4
- xsnmsubadp vs44, vs39, vs5
- xsnmsubadp vs45, vs39, vs6
- xsnmsubadp vs46, vs39, vs7
- xsnmsubadp vs47, vs39, vs8
-
- //############### OFFSET 8 #######################
-
- addi T1, T1, 8*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- xsmuldp vs40, vs40, vs0
- xsnmsubadp vs41, vs40, vs1
- xsnmsubadp vs42, vs40, vs2
- xsnmsubadp vs43, vs40, vs3
- xsnmsubadp vs44, vs40, vs4
- xsnmsubadp vs45, vs40, vs5
- xsnmsubadp vs46, vs40, vs6
- xsnmsubadp vs47, vs40, vs7
-
- //############### OFFSET 9 #######################
-
- addi T1, T1, 9*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs41, vs41, vs0
- xsnmsubadp vs42, vs41, vs1
- xsnmsubadp vs43, vs41, vs2
- xsnmsubadp vs44, vs41, vs3
- xsnmsubadp vs45, vs41, vs4
- xsnmsubadp vs46, vs41, vs5
- xsnmsubadp vs47, vs41, vs6
-
- //############### OFFSET 10 #######################
-
- addi T1, T1, 10*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs42, vs42, vs0
- xsnmsubadp vs43, vs42, vs1
- xsnmsubadp vs44, vs42, vs2
- xsnmsubadp vs45, vs42, vs3
- xsnmsubadp vs46, vs42, vs4
- xsnmsubadp vs47, vs42, vs5
-
- //############### OFFSET 11 #######################
-
- addi T1, T1, 11*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
-
- addi T1, T1, 8
-
- xsmuldp vs43, vs43, vs0
- xsnmsubadp vs44, vs43, vs1
- xsnmsubadp vs45, vs43, vs2
- xsnmsubadp vs46, vs43, vs3
- xsnmsubadp vs47, vs43, vs4
-
- //############### OFFSET 12 #######################
-
- addi T1, T1, 12*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- xsmuldp vs44, vs44, vs0
- xsnmsubadp vs45, vs44, vs1
- xsnmsubadp vs46, vs44, vs2
- xsnmsubadp vs47, vs44, vs3
-
- //############### OFFSET 13 #######################
-
- addi T1, T1, 13*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs45, vs45, vs0
- xsnmsubadp vs46, vs45, vs1
- xsnmsubadp vs47, vs45, vs2
-
- //############### OFFSET 14 #######################
-
- addi T1, T1, 14*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs46, vs46, vs0
- xsnmsubadp vs47, vs46, vs1
-
- //############### OFFSET 15 #######################
-
- // last row: only the row head remains
- addi T1, T1, 15*SIZE
-
- lxsdx vs0, o0, T1
-
- // T1 is reloaded from BO below before it is read again
- addi T1, T1, 8
-
- xsmuldp vs47, vs47, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- stxsdx vs37, o8, T1
- stxsdx vs38, o16, T1
- stxsdx vs39, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs40, o0, T1
- stxsdx vs41, o8, T1
- stxsdx vs42, o16, T1
- stxsdx vs43, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs44, o0, T1
- stxsdx vs45, o8, T1
- stxsdx vs46, o16, T1
- stxsdx vs47, o24, T1
-
- //############### SAVE C #######################
-
-
- // the same 16 results are written contiguously starting at CO
- mr T1, CO
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- stxsdx vs37, o8, T1
- stxsdx vs38, o16, T1
- stxsdx vs39, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs40, o0, T1
- stxsdx vs41, o8, T1
- stxsdx vs42, o16, T1
- stxsdx vs43, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs44, o0, T1
- stxsdx vs45, o8, T1
- stxsdx vs46, o16, T1
- stxsdx vs47, o24, T1
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 8x1
-
- Solves an 8x1 tile by forward substitution using scalar (doubleword 0) arithmetic.
-   1. The accumulators vs32-vs39 are moved to vs0-vs7 through XXSWAPD (defined outside
-      this chunk; presumably a doubleword swap - TODO confirm).
-   2. 8 doubles are loaded from BO and the moved accumulators are subtracted from them.
-   3. For each row k = 0..7 of the data at AO: vs(32+k) is multiplied by the first value
-      read for that row (multiplied, not divided, so the packed data presumably stores
-      reciprocal diagonals - TODO confirm); that product times each following value of
-      the row is then subtracted from vs(33+k)..vs39.
-      Before row k the pointer is bumped by k*SIZE to skip the k entries ahead of the row
-      head (SIZE is defined outside this chunk, presumably the 8-byte element size).
-   4. The 8 results are stored to BO and to CO.
- AO, BO and CO are not modified. Clobbers T1, vs0-vs7 and vs32-vs39.
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_8x1', `
- #else
- .macro SOLVE_LT_8x1
- #endif
-
- // free vs32-vs39 for the right-hand side by parking the accumulators in vs0-vs7
- XXSWAPD(vs0,vs32)
- XXSWAPD(vs1,vs33)
- XXSWAPD(vs2,vs34)
- XXSWAPD(vs3,vs35)
- XXSWAPD(vs4,vs36)
- XXSWAPD(vs5,vs37)
- XXSWAPD(vs6,vs38)
- XXSWAPD(vs7,vs39)
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxsdx vs32, o0, T1
- lxsdx vs33, o8, T1
- lxsdx vs34, o16, T1
- lxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs36, o0, T1
- lxsdx vs37, o8, T1
- lxsdx vs38, o16, T1
- lxsdx vs39, o24, T1
-
- // right-hand side minus the accumulated products
- xssubdp vs32, vs32, vs0
- xssubdp vs33, vs33, vs1
- xssubdp vs34, vs34, vs2
- xssubdp vs35, vs35, vs3
- xssubdp vs36, vs36, vs4
- xssubdp vs37, vs37, vs5
- xssubdp vs38, vs38, vs6
- xssubdp vs39, vs39, vs7
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // row 0: all 8 values are used, vs0 is the row head
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
- lxsdx vs7, o24, T1
-
- addi T1, T1, 32
-
- // finish unknown 0, then vs(32+j) = vs(32+j) - vs32 * vs(j) for j = 1..7
- xsmuldp vs32, vs32, vs0
- xsnmsubadp vs33, vs32, vs1
- xsnmsubadp vs34, vs32, vs2
- xsnmsubadp vs35, vs32, vs3
- xsnmsubadp vs36, vs32, vs4
- xsnmsubadp vs37, vs32, vs5
- xsnmsubadp vs38, vs32, vs6
- xsnmsubadp vs39, vs32, vs7
-
- //############### OFFSET 1 #######################
-
- // row 1: skip the one entry ahead of the row head, then read the remaining 7 values
- addi T1, T1, 1*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
- lxsdx vs6, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs33, vs33, vs0
- xsnmsubadp vs34, vs33, vs1
- xsnmsubadp vs35, vs33, vs2
- xsnmsubadp vs36, vs33, vs3
- xsnmsubadp vs37, vs33, vs4
- xsnmsubadp vs38, vs33, vs5
- xsnmsubadp vs39, vs33, vs6
-
- //############### OFFSET 2 #######################
-
- // rows 2..7 repeat the row 1 pattern with one fewer value each time
- addi T1, T1, 2*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
- lxsdx vs5, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs34, vs34, vs0
- xsnmsubadp vs35, vs34, vs1
- xsnmsubadp vs36, vs34, vs2
- xsnmsubadp vs37, vs34, vs3
- xsnmsubadp vs38, vs34, vs4
- xsnmsubadp vs39, vs34, vs5
-
- //############### OFFSET 3 #######################
-
- addi T1, T1, 3*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- lxsdx vs4, o0, T1
-
- addi T1, T1, 8
-
- xsmuldp vs35, vs35, vs0
- xsnmsubadp vs36, vs35, vs1
- xsnmsubadp vs37, vs35, vs2
- xsnmsubadp vs38, vs35, vs3
- xsnmsubadp vs39, vs35, vs4
-
- //############### OFFSET 4 #######################
-
- addi T1, T1, 4*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- xsmuldp vs36, vs36, vs0
- xsnmsubadp vs37, vs36, vs1
- xsnmsubadp vs38, vs36, vs2
- xsnmsubadp vs39, vs36, vs3
-
- //############### OFFSET 5 #######################
-
- addi T1, T1, 5*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs37, vs37, vs0
- xsnmsubadp vs38, vs37, vs1
- xsnmsubadp vs39, vs37, vs2
-
- //############### OFFSET 6 #######################
-
- addi T1, T1, 6*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs38, vs38, vs0
- xsnmsubadp vs39, vs38, vs1
-
- //############### OFFSET 7 #######################
-
- // last row: only the row head remains
- addi T1, T1, 7*SIZE
-
- lxsdx vs0, o0, T1
-
- // T1 is reloaded from BO below before it is read again
- addi T1, T1, 8
-
- xsmuldp vs39, vs39, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- stxsdx vs37, o8, T1
- stxsdx vs38, o16, T1
- stxsdx vs39, o24, T1
-
- //############### SAVE C #######################
-
-
- // the same 8 results are written contiguously starting at CO
- mr T1, CO
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- addi T1, T1, 32
-
- stxsdx vs36, o0, T1
- stxsdx vs37, o8, T1
- stxsdx vs38, o16, T1
- stxsdx vs39, o24, T1
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 4x1
-
- Solves a 4x1 tile by forward substitution using scalar (doubleword 0) arithmetic.
-   1. The accumulators vs32-vs35 are moved to vs0-vs3 through XXSWAPD (defined outside
-      this chunk; presumably a doubleword swap - TODO confirm).
-   2. 4 doubles are loaded from BO and the moved accumulators are subtracted from them.
-   3. For each row k = 0..3 of the data at AO: vs(32+k) is multiplied by the first value
-      read for that row (multiplied, not divided, so the packed data presumably stores
-      reciprocal diagonals - TODO confirm); that product times each following value of
-      the row is then subtracted from vs(33+k)..vs35.
-      Before row k the pointer is bumped by k*SIZE to skip the k entries ahead of the row
-      head (SIZE is defined outside this chunk, presumably the 8-byte element size).
-   4. The 4 results are stored to BO and to CO.
- AO, BO and CO are not modified. Clobbers T1, vs0-vs3 and vs32-vs35.
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_4x1', `
- #else
- .macro SOLVE_LT_4x1
- #endif
-
- // free vs32-vs35 for the right-hand side by parking the accumulators in vs0-vs3
- XXSWAPD(vs0,vs32)
- XXSWAPD(vs1,vs33)
- XXSWAPD(vs2,vs34)
- XXSWAPD(vs3,vs35)
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxsdx vs32, o0, T1
- lxsdx vs33, o8, T1
- lxsdx vs34, o16, T1
- lxsdx vs35, o24, T1
-
- // right-hand side minus the accumulated products
- xssubdp vs32, vs32, vs0
- xssubdp vs33, vs33, vs1
- xssubdp vs34, vs34, vs2
- xssubdp vs35, vs35, vs3
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- // row 0: all 4 values are used, vs0 is the row head
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
- lxsdx vs3, o24, T1
-
- addi T1, T1, 32
-
- // finish unknown 0, then vs(32+j) = vs(32+j) - vs32 * vs(j) for j = 1..3
- xsmuldp vs32, vs32, vs0
- xsnmsubadp vs33, vs32, vs1
- xsnmsubadp vs34, vs32, vs2
- xsnmsubadp vs35, vs32, vs3
-
- //############### OFFSET 1 #######################
-
- // row 1: skip the one entry ahead of the row head, then read the remaining 3 values
- addi T1, T1, 1*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
- lxsdx vs2, o16, T1
-
- addi T1, T1, 24
-
- xsmuldp vs33, vs33, vs0
- xsnmsubadp vs34, vs33, vs1
- xsnmsubadp vs35, vs33, vs2
-
- //############### OFFSET 2 #######################
-
- addi T1, T1, 2*SIZE
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs34, vs34, vs0
- xsnmsubadp vs35, vs34, vs1
-
- //############### OFFSET 3 #######################
-
- // last row: only the row head remains
- addi T1, T1, 3*SIZE
-
- lxsdx vs0, o0, T1
-
- // T1 is reloaded from BO below before it is read again
- addi T1, T1, 8
-
- xsmuldp vs35, vs35, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- //############### SAVE C #######################
-
-
- // the same 4 results are written contiguously starting at CO
- mr T1, CO
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
- stxsdx vs34, o16, T1
- stxsdx vs35, o24, T1
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 2x1
-
- Solves a 2x1 tile by forward substitution using scalar (doubleword 0) arithmetic.
-   1. The accumulators vs32/vs33 are moved to vs0/vs1 through XXSWAPD (defined outside
-      this chunk; presumably a doubleword swap - TODO confirm).
-   2. 2 doubles are loaded from BO and the moved accumulators are subtracted from them.
-   3. vs32 is multiplied by the first value read from AO; vs32 times the second value is
-      subtracted from vs33; vs33 is then multiplied by the value found 1*SIZE past the
-      first two. The row heads are multiplied, not divided, so the packed data presumably
-      stores reciprocal diagonals - TODO confirm.
-   4. The 2 results are stored to BO and to CO.
- AO, BO and CO are not modified. Clobbers T1, vs0, vs1, vs32, vs33.
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_2x1', `
- #else
- .macro SOLVE_LT_2x1
- #endif
-
- // free vs32/vs33 for the right-hand side by parking the accumulators in vs0/vs1
- XXSWAPD(vs0,vs32)
- XXSWAPD(vs1,vs33)
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxsdx vs32, o0, T1
- lxsdx vs33, o8, T1
-
- // right-hand side minus the accumulated products
- xssubdp vs32, vs32, vs0
- xssubdp vs33, vs33, vs1
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxsdx vs0, o0, T1
- lxsdx vs1, o8, T1
-
- addi T1, T1, 16
-
- xsmuldp vs32, vs32, vs0
- // vs33 = vs33 - vs32 * vs1
- xsnmsubadp vs33, vs32, vs1
-
- //############### OFFSET 1 #######################
-
- // skip the one entry that precedes the head of row 1
- addi T1, T1, 1*SIZE
-
- lxsdx vs0, o0, T1
-
- // T1 is reloaded from BO below before it is read again
- addi T1, T1, 8
-
- xsmuldp vs33, vs33, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
-
- stxsdx vs32, o0, T1
- stxsdx vs33, o8, T1
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
-
-
- /*##########################################################################################
- SOLVE_LT 1x1
-
- Solves a 1x1 tile using scalar (doubleword 0) arithmetic.
-   1. The accumulator vs32 is moved to vs0 through XXSWAPD (defined outside this chunk;
-      presumably a doubleword swap - TODO confirm).
-   2. One double is loaded from BO and the moved accumulator is subtracted from it.
-   3. The result is multiplied by the value read from AO. It is a multiply, not a divide,
-      so the packed data presumably stores the reciprocal diagonal - TODO confirm.
-   4. The result is stored to BO and to CO.
- AO, BO and CO are not modified. Clobbers T1, vs0, vs32.
- ##########################################################################################*/
-
- #if defined(_AIX)
- define(`SOLVE_LT_1x1', `
- #else
- .macro SOLVE_LT_1x1
- #endif
-
- // free vs32 for the right-hand side by parking the accumulator in vs0
- XXSWAPD(vs0,vs32)
-
- //############### LOAD B #######################
-
-
- mr T1, BO
-
- lxsdx vs32, o0, T1
-
- // right-hand side minus the accumulated product
- xssubdp vs32, vs32, vs0
-
- mr T1, AO
-
-
- //############### OFFSET 0 #######################
-
- lxsdx vs0, o0, T1
-
- // T1 is reloaded from BO below before it is read again
- addi T1, T1, 8
-
- xsmuldp vs32, vs32, vs0
-
- //############### SAVE B #######################
-
-
- mr T1, BO
-
-
- stxsdx vs32, o0, T1
-
- //############### SAVE C #######################
-
-
- mr T1, CO
-
- stxsdx vs32, o0, T1
-
- #if defined(_AIX)
- ')
- #else
- .endm
- #endif
|