Big Endian Changes for Power10 kernels (tags/v0.3.19)
@@ -16,6 +16,8 @@ else
HOSTARCH = $(ARCH)
endif
HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null)
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@@ -1,7 +1,6 @@
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
ifeq ($(HAVE_GAS), 1)
include $(KERNELDIR)/KERNEL.POWER8
else
#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c
@@ -44,6 +43,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_power10.S
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
@@ -218,5 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
endif
@@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
#endif
const float *mvecp = mvec;
/* We have to load reverse mask for big endian. */
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
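The two mask tables above feed the xxperm that swaps the real and imaginary float of each complex pair; the indices differ per byte order because the permute unit numbers bytes from the most-significant end, while little-endian loads place the elements in the register byte-reversed. A minimal C model of such a raw byte permute, for intuition only (perm16 is a made-up helper, not VSX code):

#include <stdio.h>

/* Raw 16-byte permute as the hardware sees it: out[i] = in[sel[i]],
   with byte 0 the most significant.  A sketch, not generated code. */
static void perm16(const unsigned char *in, const unsigned char *sel,
                   unsigned char *out)
{
    for (int i = 0; i < 16; i++)
        out[i] = in[sel[i]];
}

int main(void)
{
    unsigned char v[16], r[16];
    const unsigned char be_mask[16] =
        { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
    for (int i = 0; i < 16; i++) v[i] = (unsigned char)i;
    perm16(v, be_mask, r);
    for (int i = 0; i < 16; i++) printf("%d ", r[i]); /* 4-byte groups pair-swapped */
    printf("\n");
    return 0;
}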
long ytmp;
__asm__
@@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%4) \n\t"
"stxv 49, 16(%4) \n\t"
"stxv 50, 32(%4) \n\t"
"stxv 51, 48(%4) \n\t"
"stxv 34, 64(%4) \n\t"
"stxv 35, 80(%4) \n\t"
"stxv 38, 96(%4) \n\t"
"stxv 39, 112(%4) \n\t"
#else
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
@@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"
#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:
@@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
".align 5 \n"
"one%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
#endif
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
@@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
@@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"bgt one%= \n"
"two%=: \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
@@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
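The pattern throughout this kernel, storing the even register first on big endian and the odd one first on little endian, follows from how lxvp fills an even/odd register pair: which register of the pair receives the first 16 bytes of memory flips with the target's byte order. A small C model of that pairing (the swap rule in the comment is a stated assumption, not VSX code):

#include <stdio.h>
#include <string.h>

typedef struct { unsigned char b[16]; } vsr;

/* Model of lxvp: load 32 bytes into an even/odd register pair.
   Assumption encoded here: on big endian the even register gets
   mem[0..15]; on little endian the pair is swapped. */
static void lxvp_model(const unsigned char *mem, vsr pair[2], int big_endian)
{
    int first = big_endian ? 0 : 1;
    memcpy(pair[first].b,     mem,      16);
    memcpy(pair[1 - first].b, mem + 16, 16);
}

int main(void)
{
    unsigned char mem[32];
    vsr p[2];
    for (int i = 0; i < 32; i++) mem[i] = (unsigned char)i;
    lxvp_model(mem, p, 0);
    printf("%d %d\n", p[0].b[0], p[1].b[0]); /* 16 0 on little endian */
    return 0;
}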
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#include "common.h"
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
@@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;
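With the endianness guard dropped, the POWER10 path now runs on both byte orders; n & -16 simply rounds the trip count down to a multiple of the 16-element vector width, leaving the remainder for the scalar tail. A one-line check:

#include <stdio.h>

int main(void)
{
    long n = 77;
    /* & -16 clears the low four bits, & -8 the low three. */
    printf("%ld %ld\n", n & -16L, n & -8L); /* 64 72 */
    return 0;
}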
@@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"
@@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
"xxswapd 33, 34 \n\t"
"xvaddsp 35, 35, 32 \n\t"
"xvaddsp 34, 34, 33 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xxpermdi 34, 35, 34, 0 \n\t"
#else
"xxpermdi 34, 34, 35, 2 \n\t"
#endif
"stxv 34, 0(%6) \n\t"
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
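The final xxpermdi packs the two horizontal sums into the dot result; the immediate (0 vs 2) and the operand order swap between branches because xxpermdi selects doublewords in the ISA's fixed big-endian numbering. A C model of the instruction's selection rule (a sketch, not generated code):

#include <stdint.h>
#include <stdio.h>

/* xxpermdi XT,XA,XB,DM: the high result doubleword comes from XA and
   the low one from XB; each bit of the 2-bit immediate picks high (0)
   or low (1), counted from the most-significant end. */
static void xxpermdi_model(const uint64_t a[2], const uint64_t b[2],
                           int dm, uint64_t out[2])
{
    out[0] = a[(dm >> 1) & 1];  /* result high doubleword */
    out[1] = b[dm & 1];         /* result low doubleword */
}

int main(void)
{
    uint64_t a[2] = {0xA0, 0xA1}, b[2] = {0xB0, 0xB1}, r[2];
    xxpermdi_model(a, b, 2, r);
    printf("%llx %llx\n", (unsigned long long)r[0],
                          (unsigned long long)r[1]); /* a1 b0 */
    return 0;
}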
@@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_macros_power10.S"
#if (_AIX)
.set perm_const1, 0x0405060700010203
.set perm_const2, 0x0c0d0e0f08090a0b
.set save_permute_12, 0x1011121300010203
.set save_permute_11, 0x18191a1b08090a0b
#else
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#endif
#ifndef NEEDPARAM
@@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
#if (_AIX)
lis T2, (perm_const2>>48 & 0xFFFF)
lis T1, (perm_const1>>48 & 0xFFFF)
lis T3, (save_permute_12>>48 & 0xFFFF)
lis T4, (save_permute_11>>48 & 0xFFFF)
ori T2, T2, (perm_const2>>32 & 0xFFFF)
ori T1, T1, (perm_const1>>32 & 0xFFFF)
ori T3, T3, (save_permute_12>>32 & 0xFFFF)
ori T4, T4, (save_permute_11>>32 & 0xFFFF)
#else
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
#endif
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
#if (_AIX)
oris T2, T2, (perm_const2>>16 & 0xFFFF)
oris T1, T1, (perm_const1>>16 & 0xFFFF)
oris T3, T3, (save_permute_12>>16 & 0xFFFF)
oris T4, T4, (save_permute_11>>16 & 0xFFFF)
ori T2, T2, (perm_const2 & 0xFFFF)
ori T1, T1, (perm_const1 & 0xFFFF)
ori T3, T3, (save_permute_12 & 0xFFFF)
ori T4, T4, (save_permute_11 & 0xFFFF)
#else
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
@@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
#endif
li r0,0
li PRE,512
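The AIX branch builds each 64-bit permute constant from four explicit 16-bit pieces because the AIX assembler does not accept the ELF @highest/@higher/@h/@l operators used in the other branch. The lis/ori/rldicr/oris/ori sequence is equivalent to this C sketch (build64 is a hypothetical helper for illustration):

#include <stdint.h>
#include <stdio.h>

/* Model of the constant-materialization sequence above. */
static uint64_t build64(uint64_t k)
{
    uint64_t t;
    t = (k >> 48) & 0xFFFF;              /* lis    T, highest */
    t = (t << 16) | ((k >> 32) & 0xFFFF); /* ori    T, higher  */
    t <<= 32;                            /* rldicr T, T, 32, 31 */
    t |= ((k >> 16) & 0xFFFF) << 16;     /* oris   T, h */
    t |= k & 0xFFFF;                     /* ori    T, l */
    return t;
}

int main(void)
{
    uint64_t k = 0x0405060700010203ULL;  /* perm_const1 */
    printf("%d\n", build64(k) == k);     /* prints 1 */
    return 0;
}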
@@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.endm
.macro LOAD4x8_2
@@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 36, 34
xvf32gerpp 2, 37, 34
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 7, 36, 35
xvf32gerpp 6, 37, 35
xvf32gerpp 5, 32, 35
xvf32gerpp 4, 33, 35
#else
xvf32gerpp 3, 36, 35
xvf32gerpp 2, 37, 35
xvf32gerpp 1, 32, 35
@@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 37, 34
xvf32gerpp 5, 32, 34
xvf32gerpp 4, 33, 34
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xvf32gerpp 3, 42, 38
xvf32gerpp 2, 43, 38
xvf32gerpp 1, 40, 38
xvf32gerpp 0, 41, 38
xvf32gerpp 7, 42, 39
xvf32gerpp 6, 43, 39
xvf32gerpp 5, 40, 39
xvf32gerpp 4, 41, 39
#else
xvf32gerpp 3, 42, 39
xvf32gerpp 2, 43, 39
xvf32gerpp 1, 40, 39
@@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 6, 43, 38
xvf32gerpp 5, 40, 38
xvf32gerpp 4, 41, 38
#endif
.if \Complete==0
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
@@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
@@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11
xvaddsp vs35, vs35, vs9
xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13
#else
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
xxpermdi vs33, vs0, vs8, 1
xxpermdi vs32, vs2, vs10, 1
xxpermdi vs41, vs4, vs12, 1
xxpermdi vs40, vs6, vs14, 1
xxpermdi vs35, vs8, vs0, 1
xxpermdi vs34, vs10, vs2, 1
xxpermdi vs43, vs12, vs4, 1
xxpermdi vs42, vs14, vs6, 1
#else
xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2
@@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs34, vs2, vs10, 2
xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2
#endif
#endif
stxvp vs32, 0(T2)
stxvp vs40, 32(T2)
@@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.endm
.macro LOAD4x4_2
@@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 32, 35
xvf32gerpp 2, 33, 35
xvf32gerpp 1, 32, 34
xvf32gerpp 0, 33, 34
#else
xvf32gerpp 3, 32, 34
xvf32gerpp 2, 33, 34
xvf32gerpp 1, 32, 35
xvf32gerpp 0, 33, 35
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 3, 36, 39
xvf32gerpp 2, 37, 39
xvf32gerpp 1, 36, 38
xvf32gerpp 0, 37, 38
#else
xvf32gerpp 3, 36, 38
xvf32gerpp 2, 37, 38
xvf32gerpp 1, 36, 39
xvf32gerpp 0, 37, 39
#endif
.if \Complete==0
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
@@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
@@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvaddsp vs29, vs29, vs5
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
xxpermdi vs29, vs4, vs12, 1
xxpermdi vs28, vs6, vs14, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
@@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs14, vs6, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 32
xvf32gerpp 0, 35, 32
#endif
.endm
.macro LOAD4x2_2
@@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 35, 32
xvf32gerpp 0, 34, 32
#else
xvf32gerpp 1, 34, 33
xvf32gerpp 0, 35, 33
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 1, 37, 33
xvf32gerpp 0, 36, 33
#else
xvf32gerpp 1, 36, 32
xvf32gerpp 0, 37, 32
#endif
.if \Complete==0
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
@@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs2, vs10, 0
xxpermdi vs3, vs8, vs0, 3
xxpermdi vs11, vs10, vs2, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs2, vs10, 0
xxpermdi vs25, vs8, vs0, 3
xxpermdi vs27, vs10, vs2, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs25, 0(T1)
@@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.endm
.macro LOAD4x1_2
@@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
.endm
@@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 34, 32
xvf32gerpp 1, 35, 32
#else
xvf32gerpp 0, 35, 32
xvf32gerpp 1, 34, 32
#endif
.if \Complete==0
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 36, 33
xvf32gerpp 1, 37, 33
#else
xvf32gerpp 0, 37, 33
xvf32gerpp 1, 36, 33
#endif
.if \Complete==0
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs33, vs32, vs38, 2
xxpermdi vs32, vs32, vs38, 0
#else
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
#endif
.endif
.if \IsLast==1
.if \Complete==1
@@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 37, 34
xvf32gerpp 3, 36, 34
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 2, 41, 35
xvf32gerpp 3, 40, 35
xvf32gerpp 0, 39, 35
xvf32gerpp 1, 38, 35
#else
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
@@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR2
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs5, vs4, vs12, 1
xxpermdi vs7, vs6, vs14, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs13, vs12, vs4, 1
xxpermdi vs15, vs14, vs6, 1
#else
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
#endif
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
@@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs4, vs12, 1
xxpermdi vs26, vs6, vs14, 1
xxpermdi vs29, vs8, vs0, 1
xxpermdi vs28, vs10, vs2, 1
xxpermdi vs31, vs12, vs4, 1
xxpermdi vs30, vs14, vs6, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs12, vs4, 2
@@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)
@@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
#else
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
#endif
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf32gerpp 0, 37, 35
xvf32gerpp 1, 36, 35
#else
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
#endif
.if \Complete==0
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
@@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
RECONSTRUCT_PAIR1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 1
xxpermdi vs3, vs2, vs10, 1
xxpermdi vs9, vs8, vs0, 1
xxpermdi vs11, vs10, vs2, 1
#else
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
#endif
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs25, vs0, vs8, 1
xxpermdi vs24, vs2, vs10, 1
xxpermdi vs27, vs8, vs0, 1
xxpermdi vs26, vs10, vs2, 1
#else
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
#endif
#endif
stxvp vs24, 0(CO)
stxvp vs26, 0(T1)
@@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)
@@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm
@@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
#endif
addi CO, CO, 64
.endm
@@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm
@@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
@@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
@@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
#endif
addi CO, CO, 32
.endm
@@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0
@@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37
@@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
__asm__
(
"dcbt 0, %2 \n\t"
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "cswap_microk_power8.c"
#include "cswap_microk_power10.c"
#endif
#endif
@@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dasum_microk_power10.c"
#elif defined(POWER10)
#include "dasum_microk_power8.c"
#include "dasum_microk_power10.c"
#endif
#endif
#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
@@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32)
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
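The align expression computes how many leading doubles the scalar prologue must consume before x reaches a 32-byte boundary (>> 3 converts bytes to 8-byte elements; the single-precision kernels later in the patch use >> 2 and & 0x7 for the same purpose). A worked check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uintptr_t x = 0x1008;                          /* example address */
    long align = ((32 - (x & (uintptr_t)0x1F)) >> 3) & 0x3;
    printf("%ld\n", align);                        /* 3: 0x1008 + 3*8 = 0x1020 */
    return 0;
}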
@@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha
#else
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
#endif
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
#endif
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha
#else
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
@@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
#endif
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
@@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
#endif
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
#else
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
#endif
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
#else
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
#endif
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
#else
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
#endif
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
#else
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
#endif
"lxvpx 50, %7, %11 \n\t" // a4[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
#else
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
#endif
"lxvpx 52, %8, %11 \n\t" // a5[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
#else
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
#endif
"lxvpx 54, %9, %11 \n\t" // a6[0]
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
@@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 36, 40, 32 \n\t"
"xvmaddadp 37, 41, 32 \n\t"
"xvmaddadp 36, 42, 33 \n\t"
"xvmaddadp 37, 43, 33 \n\t"
"xvmaddadp 36, 44, 34 \n\t"
"xvmaddadp 37, 45, 34 \n\t"
"xvmaddadp 36, 46, 35 \n\t"
"xvmaddadp 37, 47, 35 \n\t"
"xvmaddadp 36, 50, 38 \n\t"
"xvmaddadp 37, 51, 38 \n\t"
"xvmaddadp 36, 52, 39 \n\t"
"xvmaddadp 37, 53, 39 \n\t"
"xvmaddadp 36, 54, 48 \n\t"
"xvmaddadp 37, 55, 48 \n\t"
"xvmaddadp 36, 56, 49 \n\t"
"xvmaddadp 37, 57, 49 \n\t"
#else
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
@@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
#endif
"stxvp 36, 0( %2) \n\t" // y0, y1
:
@@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvp 40, 32(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)
XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
#endif
"xvadddp 42,42,43 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y),
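The XXMRGHD_S/XXMRGLD_S pairs above perform the horizontal reduction of per-column partial sums: merging the high and the low doublewords of two accumulators and then adding yields both column totals in one register, with the operand order flipped per byte order. Equivalent scalar arithmetic, as a sketch:

#include <stdio.h>

int main(void)
{
    double a[2] = {1, 2}, b[2] = {3, 4};   /* two partial-sum registers */
    double hi[2] = {a[0], b[0]};           /* xxmrghd: high doublewords */
    double lo[2] = {a[1], b[1]};           /* xxmrgld: low doublewords */
    printf("%g %g\n", hi[0] + lo[0], hi[1] + lo[1]); /* 3 7 = sum(a), sum(b) */
    return 0;
}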
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "drot_microk_power10.c"
#elif defined(POWER10)
#include "drot_microk_power8.c"
#include "drot_microk_power10.c"
#endif
#endif
@@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "dscal_microk_power10.c"
#elif defined(POWER10)
#include "dscal_microk_power8.c"
#include "dscal_microk_power10.c"
#endif
#endif
@@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "dswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
@@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sasum_microk_power10.c"
#elif defined(POWER10)
#include "sasum_microk_power8.c"
#include "sasum_microk_power10.c"
#endif
#endif
@@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "srot_microk_power10.c"
#elif defined(POWER10)
#include "srot_microk_power8.c"
#include "srot_microk_power10.c"
#endif
#endif
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "sscal_microk_power10.c"
#elif defined(POWER10)
#include "sscal_microk_power8.c"
#include "sscal_microk_power10.c"
#endif
#endif
@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#include "swap_microk_power10.c"
#elif defined(POWER10)
#include "sswap_microk_power8.c"
#include "swap_microk_power10.c"
#endif
#endif
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
double alpha_r, double alpha_i)
{
#if !defined(CONJ)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { -1.0, 1.0 };
#else
static const double mvec[2] = { 1.0, -1.0 };
#endif
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
static const double mvec[2] = { 1.0, -1.0 };
#else
static const double mvec[2] = { -1.0, 1.0 };
#endif
#endif
const double *mvecp = mvec;
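The mvec signs encode the one negated cross term of a complex multiply-accumulate, so the conjugated case just swaps the table and endianness swaps the lane order. In scalar form the same computation is:

#include <stdio.h>

int main(void)
{
    /* y += alpha * x, complex doubles, non-conjugated case */
    double ar = 2, ai = 3, xr = 5, xi = 7, yr = 0, yi = 0;
    yr += ar * xr - ai * xi;   /* the -1.0 lane of mvec */
    yi += ar * xi + ai * xr;   /* the +1.0 lane of mvec */
    printf("%g %g\n", yr, yi); /* -11 29 */
    return 0;
}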
@@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r0, FLINK_SAVE(SP)
#if defined(linux) || defined(__FreeBSD__)
#if defined(linux) || defined(__FreeBSD__) || defined(_AIX)
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
#ifdef TRMMKERNEL
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif
#endif
@@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#endif
.endm
/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#endif
.endm
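Both macros are doubleword unzips: from two packed products they gather the matching halves into one register each, with xxmrghd and xxmrgld swapped on big endian because the merge selects physical register halves. A scalar restatement of what gets packed:

#include <stdio.h>

int main(void)
{
    /* two packed results: {a0r*br, a0i*bi} and {a1r*br, a1i*bi} */
    double r0[2] = {1.0, 2.0}, r1[2] = {3.0, 4.0};
    double rr[2] = {r0[0], r1[0]};  /* real*real from both results */
    double ii[2] = {r0[1], r1[1]};  /* imag*imag from both results */
    printf("%g %g / %g %g\n", rr[0], rr[1], ii[0], ii[1]); /* 1 3 / 2 4 */
    return 0;
}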
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
@@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxmrghd \VSOUT1, \VSIN1, \VSIN2
  xxmrgld \VSOUT2, \VSIN1, \VSIN2
#else
  xxmrghd \VSOUT1, \VSIN2, \VSIN1
  xxmrgld \VSOUT2, \VSIN2, \VSIN1
#endif
.endm
@@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
  lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxmrghd vs46, vs50, vs50
  xxmrgld vs47, vs50, vs50
#else
  xxmrgld vs46, vs50, vs50
  xxmrghd vs47, vs50, vs50
#endif
#endif
  RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
  AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
  MULT_APLHA_PART1 vs34,vs36, vs46,vs47
  MULT_APLHA_PART2 vs34,vs36, vs46,vs47
  UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
  xxmrghd vs39, vs47, vs46
#endif
  stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
@@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
  lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
  lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 2, vs36, vs48
  xvf64gerpp 3, vs38, vs48
  xvf64gerpp 4, vs32, vs49
  xvf64gerpp 5, vs34, vs49
  xvf64gerpp 6, vs36, vs49
  xvf64gerpp 7, vs38, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs36, vs49
@@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xvf64gerpp 5, vs34, vs48
  xvf64gerpp 6, vs36, vs48
  xvf64gerpp 7, vs38, vs48
#endif
  lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
  lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
  lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
  lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
  lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs40, vs50
  xvf64gerpp 1, vs42, vs50
  xvf64gerpp 2, vs44, vs50
  xvf64gerpp 3, vs46, vs50
  xvf64gerpp 4, vs40, vs51
  xvf64gerpp 5, vs42, vs51
  xvf64gerpp 6, vs44, vs51
  xvf64gerpp 7, vs46, vs51
#else
  xvf64gerpp 0, vs40, vs51
  xvf64gerpp 1, vs42, vs51
  xvf64gerpp 2, vs44, vs51
@@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xvf64gerpp 5, vs42, vs50
  xvf64gerpp 6, vs44, vs50
  xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
  addi AO, AO, DISP16(\Index,256)
  addi BO, BO, DISP4(\Index,64)
@@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 2, vs36, vs48
  xvf64gerpp 3, vs38, vs48
  xvf64gerpp 4, vs32, vs49
  xvf64gerpp 5, vs34, vs49
  xvf64gerpp 6, vs36, vs49
  xvf64gerpp 7, vs38, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs36, vs49
@@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xvf64gerpp 5, vs34, vs48
  xvf64gerpp 6, vs36, vs48
  xvf64gerpp 7, vs38, vs48
#endif
  addi BO, BO, \OffsetB
  addi AO, AO, \OffsetA
.endm
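The vs48/vs49 swaps in these GER blocks come from lxvp pair ordering, not from lane order inside a single register: lxvp fills an even/odd VSR pair, and the half of the 32-byte storage operand that lands in the even register is the low-addressed half on big endian but the high-addressed half on little endian. So the B data at offset 0 arrives in vs48 on BE and in vs49 on LE, and each xvf64gerpp (which, per ISA 3.1, accumulates a 4x2 rank-1 update acc[i][j] += A[i]*B[j] from a VSR pair and one VSR) must pick the matching source. A sketch to observe this, not from the patch (assumes a toolchain with -mcpu=power10):

#include <stdio.h>

int main (void)
{
  double mem[4] = {1.0, 2.0, 3.0, 4.0};
  double first[2], second[2];
  __asm__ ("lxvp 32, 0(%0) \n\t"   /* load mem[0..3] into the vs32:vs33 pair */
           "stxv 32, 0(%1) \n\t"   /* even register of the pair */
           "stxv 33, 0(%2)"        /* odd register of the pair  */
           :
           : "b" (mem), "b" (first), "b" (second)
           : "v0", "v1", "memory");
  /* big endian:    first = {1, 2}, second = {3, 4}
     little endian: first = {3, 4}, second = {1, 2} */
  printf ("first = {%g, %g}, second = {%g, %g}\n",
          first[0], first[1], second[0], second[1]);
  return 0;
}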
@@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs45, vs12, vs13, 0b10
  xxpermdi vs46, vs14, vs15, 0b01
  xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
  xxlor vs4, vs36, vs36
  xxlor vs5, vs37, vs37
  xxlor vs6, vs38, vs38
  xxlor vs7, vs39, vs39
  xxlor vs8, vs40, vs40
  xxlor vs9, vs41, vs41
  xxlor vs10, vs42, vs42
  xxlor vs11, vs43, vs43
  xxlor vs12, vs44, vs44
  xxlor vs13, vs45, vs45
  xxlor vs14, vs46, vs46
  xxlor vs15, vs47, vs47
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
@@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs15, vs45, vs45
  xxlor vs12, vs46, vs46
  xxlor vs13, vs47, vs47
#endif
  xxpermdi vs32, vs16, vs17, 0b01
  xxpermdi vs33, vs16, vs17, 0b10
  xxpermdi vs34, vs18, vs19, 0b01
@@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs45, vs28, vs29, 0b10
  xxpermdi vs46, vs30, vs31, 0b01
  xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs16, vs32, vs32
  xxlor vs17, vs33, vs33
  xxlor vs18, vs34, vs34
  xxlor vs19, vs35, vs35
  xxlor vs20, vs36, vs36
  xxlor vs21, vs37, vs37
  xxlor vs22, vs38, vs38
  xxlor vs23, vs39, vs39
  xxlor vs24, vs40, vs40
  xxlor vs25, vs41, vs41
  xxlor vs26, vs42, vs42
  xxlor vs27, vs43, vs43
  xxlor vs28, vs44, vs44
  xxlor vs29, vs45, vs45
  xxlor vs30, vs46, vs46
  xxlor vs31, vs47, vs47
#else
  xxlor vs18, vs32, vs32
  xxlor vs19, vs33, vs33
  xxlor vs16, vs34, vs34
@@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs31, vs45, vs45
  xxlor vs28, vs46, vs46
  xxlor vs29, vs47, vs47
#endif
  SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
  SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
  addi CO, CO, 128
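The crossed xxlor targets in the little-endian branch (vs0 takes vs34 where big endian takes vs32, and so on) compensate for the order in which the rows of each MMA accumulator come back out into its four underlying VSRs: the row that belongs at the lowest C address differs by endianness, so the copies renumber the registers before SAVE8 stores them. For new code the compiler's MMA builtins can hide this bookkeeping; a hedged sketch (assumes GCC 10+ or a recent clang with -mcpu=power10; spill_acc is a name invented here):

#include <altivec.h>

/* Zero an accumulator and spill its four rows to memory.
   __builtin_mma_disassemble_acc is intended to store the rows in a
   consistent order across endiannesses (check your compiler's MMA
   documentation), avoiding manual renumbering like the #if blocks above. */
void spill_acc (double *out)
{
  __vector_quad acc;
  vector double rows[4];
  __builtin_mma_xxsetaccz (&acc);              /* acc = 0 */
  __builtin_mma_disassemble_acc (rows, &acc);  /* rows[0..3] <- acc */
  for (int i = 0; i < 4; i++)
    vec_xst (rows[i], 16 * i, out);            /* 16 bytes per row */
}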
@@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
  lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
  lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs32, vs48
  xvf64gerpp 3, vs34, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 2, vs32, vs49
  xvf64gerpp 3, vs34, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs32, vs48
  xvf64gerpp 3, vs34, vs48
#endif
  lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
  lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
  lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
  xvf64gerpp 0, vs40, vs51
  xvf64gerpp 1, vs42, vs51
  xvf64gerpp 2, vs40, vs50
  xvf64gerpp 3, vs42, vs50
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs40, vs50
  xvf64gerpp 1, vs42, vs50
  xvf64gerpp 2, vs40, vs51
  xvf64gerpp 3, vs42, vs51
#else
  xvf64gerpp 0, vs40, vs51
  xvf64gerpp 1, vs42, vs51
  xvf64gerpp 2, vs40, vs50
  xvf64gerpp 3, vs42, vs50
#endif
.if \IsLast==1
  addi AO, AO, DISP8(\Index,128)
  addi BO, BO, DISP4(\Index,64)
@@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x4 OffsetA, OffsetB
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs32, vs48
  xvf64gerpp 3, vs34, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 2, vs32, vs49
  xvf64gerpp 3, vs34, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs32, vs48
  xvf64gerpp 3, vs34, vs48
#endif
  addi BO, BO, \OffsetB
  addi AO, AO, \OffsetA
.endm
@@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs45, vs12, vs13, 0b10
  xxpermdi vs46, vs14, vs15, 0b01
  xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
  xxlor vs4, vs36, vs36
  xxlor vs5, vs37, vs37
  xxlor vs6, vs38, vs38
  xxlor vs7, vs39, vs39
  xxlor vs8, vs40, vs40
  xxlor vs9, vs41, vs41
  xxlor vs10, vs42, vs42
  xxlor vs11, vs43, vs43
  xxlor vs12, vs44, vs44
  xxlor vs13, vs45, vs45
  xxlor vs14, vs46, vs46
  xxlor vs15, vs47, vs47
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
@@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs15, vs45, vs45
  xxlor vs12, vs46, vs46
  xxlor vs13, vs47, vs47
#endif
  SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
  SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
  addi CO, CO, 64
@@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL2x2_2 Index, IsLast
  lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
  lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs32, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs32, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs32, vs48
#endif
  lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
  lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
  xvf64gerpp 0, vs40, vs51
  xvf64gerpp 1, vs40, vs50
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs40, vs50
  xvf64gerpp 1, vs40, vs51
#else
  xvf64gerpp 0, vs40, vs51
  xvf64gerpp 1, vs40, vs50
#endif
.if \IsLast==1
  addi AO, AO, DISP4(\Index,64)
  addi BO, BO, DISP4(\Index,64)
@@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro LOAD_END_2x2 OffsetA,OffsetB
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs32, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs32, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs32, vs48
#endif
  addi BO, BO, \OffsetB
  addi AO, AO, \OffsetA
.endm
@@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs37, vs4, vs5, 0b10
  xxpermdi vs38, vs6, vs7, 0b01
  xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
  xxlor vs4, vs36, vs36
  xxlor vs5, vs37, vs37
  xxlor vs6, vs38, vs38
  xxlor vs7, vs39, vs39
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
@@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs7, vs37, vs37
  xxlor vs4, vs38, vs38
  xxlor vs5, vs39, vs39
#endif
  SAVE2 vs0,vs1,vs2,vs3,CO,0
  SAVE2 vs4,vs5,vs6,vs7,T1,0
  addi CO, CO, 32
@@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
  lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
  lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs36, vs49
  xvf64gerpp 3, vs38, vs49
  xvf64gerpp 0, vs40, vs48
  xvf64gerpp 1, vs42, vs48
  xvf64gerpp 2, vs44, vs48
  xvf64gerpp 3, vs46, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 2, vs36, vs48
  xvf64gerpp 3, vs38, vs48
  xvf64gerpp 0, vs40, vs49
  xvf64gerpp 1, vs42, vs49
  xvf64gerpp 2, vs44, vs49
  xvf64gerpp 3, vs46, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 2, vs36, vs49
  xvf64gerpp 3, vs38, vs49
  xvf64gerpp 0, vs40, vs48
  xvf64gerpp 1, vs42, vs48
  xvf64gerpp 2, vs44, vs48
  xvf64gerpp 3, vs46, vs48
#endif
.if \IsLast==1
  addi AO, AO, DISP16(\Index,256)
  addi BO, BO, DISP2(\Index,32)
@@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs45, vs12, vs13, 0b10
  xxpermdi vs46, vs14, vs15, 0b01
  xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
  xxlor vs4, vs36, vs36
  xxlor vs5, vs37, vs37
  xxlor vs6, vs38, vs38
  xxlor vs7, vs39, vs39
  xxlor vs8, vs40, vs40
  xxlor vs9, vs41, vs41
  xxlor vs10, vs42, vs42
  xxlor vs11, vs43, vs43
  xxlor vs12, vs44, vs44
  xxlor vs13, vs45, vs45
  xxlor vs14, vs46, vs46
  xxlor vs15, vs47, vs47
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
@@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs15, vs45, vs45
  xxlor vs12, vs46, vs46
  xxlor vs13, vs47, vs47
#endif
  SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
  addi CO, CO, 128
.endm
@@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
  lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
  lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 0, vs40, vs48
  xvf64gerpp 1, vs42, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 1, vs34, vs48
  xvf64gerpp 0, vs40, vs49
  xvf64gerpp 1, vs42, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 1, vs34, vs49
  xvf64gerpp 0, vs40, vs48
  xvf64gerpp 1, vs42, vs48
#endif
.if \IsLast==1
  addi AO, AO, DISP8(\Index,128)
  addi BO, BO, DISP2(\Index,32)
@@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs37, vs4, vs5, 0b10
  xxpermdi vs38, vs6, vs7, 0b01
  xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
  xxlor vs4, vs36, vs36
  xxlor vs5, vs37, vs37
  xxlor vs6, vs38, vs38
  xxlor vs7, vs39, vs39
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
@@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxlor vs7, vs37, vs37
  xxlor vs4, vs38, vs38
  xxlor vs5, vs39, vs39
#endif
  SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
  addi CO, CO, 64
.endm
@@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
  lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
  lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 0, vs40, vs48
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xvf64gerpp 0, vs32, vs48
  xvf64gerpp 0, vs40, vs49
#else
  xvf64gerpp 0, vs32, vs49
  xvf64gerpp 0, vs40, vs48
#endif
.if \IsLast==1
  addi AO, AO, DISP4(\Index,64)
  addi BO, BO, DISP2(\Index,32)
@@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  xxpermdi vs33, vs0, vs1, 0b10
  xxpermdi vs34, vs2, vs3, 0b01
  xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxlor vs0, vs32, vs32
  xxlor vs1, vs33, vs33
  xxlor vs2, vs34, vs34
  xxlor vs3, vs35, vs35
#else
  xxlor vs2, vs32, vs32
  xxlor vs3, vs33, vs33
  xxlor vs0, vs34, vs34
  xxlor vs1, vs35, vs35
#endif
  SAVE2 vs0,vs1,vs2,vs3,CO,0
  addi CO, CO, 32
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#elif HAVE_KERNEL_4x4_VEC
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#if defined(POWER10)
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
@@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power10.c"
#else
#include "cscal_microk_power10.c"
#endif
#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
#endif
#endif
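The two dispatch hunks above simply drop the `__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__` qualifier, so big-endian POWER10 builds now select the POWER10 microkernels instead of falling back to the POWER8 ones. All of these guards key off the compiler-provided `__BYTE_ORDER__` / `__ORDER_BIG_ENDIAN__` macros; a quick standalone check of what a given toolchain defines (not from the patch):

#include <stdio.h>

int main (void)
{
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  puts ("compile time: big endian");
#else
  puts ("compile time: little endian (or macro not defined)");
#endif
  unsigned int probe = 1;              /* runtime cross-check */
  puts (*(unsigned char *) &probe
        ? "run time: little endian"
        : "run time: big endian");
  return 0;
}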
@@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
  "xsnegdp 33, %x10 \n\t" // -alpha_i
  XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i
#else
  XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
#endif
  "lxvp 40, 0(%2) \n\t"
  "lxvp 42, 32(%2) \n\t"
@@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
  "xvadddp 49, 49, 39 \n\t"
  "xvadddp 50, 50, %x3 \n\t"
  "xvadddp 51, 51, %x4 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  "stxv 48, 0(%2) \n\t"
  "stxv 49, 16(%2) \n\t"
  "stxv 50, 32(%2) \n\t"
  "stxv 51, 48(%2) \n\t"
#else
  "stxv 49, 0(%2) \n\t"
  "stxv 48, 16(%2) \n\t"
  "stxv 51, 32(%2) \n\t"
  "stxv 50, 48(%2) \n\t"
#endif
  "xvadddp 34, 34, %x5 \n\t"
@@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
  "xvadddp 36, 36, %x7 \n\t"
  "xvadddp 37, 37, %x8 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  "stxv 34, 64(%2) \n\t"
  "stxv 35, 80(%2) \n\t"
  "stxv 36, 96(%2) \n\t"
  "stxv 37, 112(%2) \n\t"
#else
  "stxv 35, 64(%2) \n\t"
  "stxv 34, 80(%2) \n\t"
  "stxv 37, 96(%2) \n\t"
  "stxv 36, 112(%2) \n\t"
#endif
  "addi %2, %2, 128 \n\t"
  "addic. %1, %1, -8 \n\t"
@@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
  "xvadddp 50, 50, %x3 \n\t"
  "xvadddp 51, 51, %x4 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  "stxv 48, 0(%2) \n\t"
  "stxv 49, 16(%2) \n\t"
  "stxv 50, 32(%2) \n\t"
  "stxv 51, 48(%2) \n\t"
#else
  "stxv 49, 0(%2) \n\t"
  "stxv 48, 16(%2) \n\t"
  "stxv 51, 32(%2) \n\t"
  "stxv 50, 48(%2) \n\t"
#endif
  "xvadddp 34, 34, %x5 \n\t"
  "xvadddp 35, 35, %x6 \n\t"
  "xvadddp 36, 36, %x7 \n\t"
  "xvadddp 37, 37, %x8 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  "stxv 34, 64(%2) \n\t"
  "stxv 35, 80(%2) \n\t"
  "stxv 36, 96(%2) \n\t"
  "stxv 37, 112(%2) \n\t"
#else
  "stxv 35, 64(%2) \n\t"
  "stxv 34, 80(%2) \n\t"
  "stxv 37, 96(%2) \n\t"
  "stxv 36, 112(%2) \n\t"
#endif
  "#n=%1 x=%0=%2 alpha=(%9,%10) \n"
  :
  "+m" (*x),
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "zswap_microk_power8.c"
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
#elif defined(POWER10)
#include "cswap_microk_power10.c"
#elif defined(POWER10)
#include "zswap_microk_power8.c"
#endif
#endif
@@ -2465,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8